diff --git a/CMakeLists.txt b/CMakeLists.txt index e85fce58368aa233e39a554947e20a128fce6218..61f5e63098c40f140774ba6bfd9a2de8d2d67bfb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,12 +25,18 @@ message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: " message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") if(WIN32) + set(CMAKE_SUPPRESS_REGENERATION ON) set(CMAKE_STATIC_LIBRARY_PREFIX lib) add_definitions("/DGOOGLE_GLOG_DLL_DECL=") set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") + add_compile_options(/wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838) + set(PADDLE_LINK_FLAGS "/IGNORE:4006 /IGNORE:4098 /IGNORE:4217 /IGNORE:4221") + set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") endif(WIN32) find_package(CUDA QUIET) @@ -212,7 +218,7 @@ endif() if (WITH_JEMALLOC) find_package(JeMalloc REQUIRED) include_directories(${JEMALLOC_INCLUDE_DIR}) - add_definitions(-DWITH_JEMALLOC) + add_definitions(-DPADDLE_WITH_JEMALLOC) endif() include(generic) # simplify cmake module diff --git a/README.md b/README.md index 32a302cc5431a62b310d4812b545bd929f090e0a..68421cf177f4cd15f8f44e8d00a27cafb5a13b91 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,6 @@ # PaddlePaddle +English | [简体中文](./README_cn.md) [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle) [![Documentation 
Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) @@ -7,7 +8,6 @@ [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) - Welcome to the PaddlePaddle GitHub. PaddlePaddle (PArallel Distributed Deep LEarning) is an easy-to-use, @@ -18,16 +18,6 @@ learning to many products at Baidu. Our vision is to enable deep learning for everyone via PaddlePaddle. Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle. - -欢迎来到 PaddlePaddle GitHub - -PaddlePaddle (PArallel Distributed Deep LEarning) 是一个简单易用、高效灵活、可扩展的深度学习平台,最初由百度科学家和工程师共同开发,目的是将深度学习技术应用到百度的众多产品中。 - -我们的愿景是让每个人都能通过PaddlePaddle接触深度学习 - -跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases) - - ### Latest PaddlePaddle Release: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2) ### Install Latest Stable Release: ``` @@ -43,23 +33,6 @@ pip install paddlepaddle-gpu==1.2.0.post85 # For installation on other platform, refer to http://paddlepaddle.org/ ``` - -### PaddlePaddle最新版本: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2) -### 安装最新稳定版本: -``` -# Linux CPU -pip install paddlepaddle -# Linux GPU cuda9cudnn7 -pip install paddlepaddle-gpu -# Linux GPU cuda8cudnn7 -pip install paddlepaddle-gpu==1.2.0.post87 -# Linux GPU cuda8cudnn5 -pip install paddlepaddle-gpu==1.2.0.post85 - -# 其他平台上的安装指引请参考 http://paddlepaddle.org/ -``` - - ## Features - **Flexibility** @@ -100,38 +73,10 @@ pip install paddlepaddle-gpu==1.2.0.post85 Baidu and it has achieved a significant impact. We hope you can also explore the capability of PaddlePaddle to make an impact on your product. 
-## 特点 - -- **灵活性** - - PaddlePaddle支持丰富的神经网络架构和优化算法。易于配置复杂模型,例如带有注意力机制或复杂记忆连接的神经网络机器翻译模型。 - -- **高效性** - - 为了高效使用异步计算资源,PaddlePaddle对框架的不同层进行优化,包括计算、存储、架构和通信。下面是一些样例: - - - 通过SSE/AVX 内置函数、BLAS库(例如MKL、OpenBLAS、cuBLAS)或定制的CPU/GPU内核优化数学操作。 - - 通过MKL-DNN库优化CNN网络 - - 高度优化循环网络,无需执行 `padding` 操作即可处理 **变长** 序列 - - 针对高维稀疏数据模型,优化了局部和分布式训练。 - - -- **稳定性** - - 有了 PaddlePaddle,使得利用各种CPU/GPU和机器来加速训练变得简单。PaddlePaddle 通过优化通信可以实现巨大吞吐量和快速执行。 - -- **连接产品** - - 另外,PaddlePaddle 的设计也易于部署。在百度,PaddlePaddle 已经部署到含有巨大用户量的产品和服务上,包括广告点击率(CTR)预测、大规模图像分类、光学字符识别(OCR)、搜索排序,计算机病毒检测、推荐系统等等。PaddlePaddle广泛应用于百度产品中,产生了非常重要的影响。我们希望您也能探索 PaddlePaddle 的能力,为您的产品创造新的影响力和效果。 - ## Installation It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) on our website. -## 安装 - -推荐阅读官网上的[安装说明](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) - ## Documentation We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) and @@ -153,37 +98,9 @@ We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarte We appreciate your contributions! -## 文档 - -我们提供[英文](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html)和 -[中文](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) 文档 - -- [深度学习101](https://github.com/PaddlePaddle/book) - - 或许您想从这个在线交互式书籍开始,可以在Jupyter Notebook中运行 - -- [分布式训练](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html) - - 可以在MPI集群上运行分布式训练任务 - -- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html) - - 新的API支持代码更少更简洁的程序 - -- [贡献方式](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html) - - 欢迎您的贡献! - ## Ask Questions You are welcome to submit questions and bug reports as [Github Issues](https://github.com/PaddlePaddle/Paddle/issues). 
-## 答疑 - -欢迎您将问题和bug报告以[Github Issues](https://github.com/PaddlePaddle/Paddle/issues)的形式提交 - ## Copyright and License PaddlePaddle is provided under the [Apache-2.0 license](LICENSE). - -## 版权和许可证 -PaddlePaddle由[Apache-2.0 license](LICENSE)提供 diff --git a/README_cn.md b/README_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..dfb55b17ca4fd05ce5b7b85b2e26e4f7f7229763 --- /dev/null +++ b/README_cn.md @@ -0,0 +1,88 @@ +# PaddlePaddle + +[English](./README.md) | 简体中文 + +[![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle) +[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) +[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) +[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases) +[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) + +欢迎来到 PaddlePaddle GitHub + +PaddlePaddle (PArallel Distributed Deep LEarning) 是一个简单易用、高效灵活、可扩展的深度学习平台,最初由百度科学家和工程师共同开发,目的是将深度学习技术应用到百度的众多产品中。 + +我们的愿景是让每个人都能通过PaddlePaddle接触深度学习 + +跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases) + +### PaddlePaddle最新版本: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2) +### 安装最新稳定版本: +``` +# Linux CPU +pip install paddlepaddle +# Linux GPU cuda9cudnn7 +pip install paddlepaddle-gpu +# Linux GPU cuda8cudnn7 +pip install paddlepaddle-gpu==1.2.0.post87 +# Linux GPU cuda8cudnn5 +pip install paddlepaddle-gpu==1.2.0.post85 + +# 其他平台上的安装指引请参考 http://paddlepaddle.org/ +``` + +## 特性 + +- **灵活性** + + PaddlePaddle支持丰富的神经网络架构和优化算法。易于配置复杂模型,例如带有注意力机制或复杂记忆连接的神经网络机器翻译模型。 + +- **高效性** + + 为了高效使用异步计算资源,PaddlePaddle对框架的不同层进行优化,包括计算、存储、架构和通信。下面是一些样例: + + - 
通过SSE/AVX 内置函数、BLAS库(例如MKL、OpenBLAS、cuBLAS)或定制的CPU/GPU内核优化数学操作。 + - 通过MKL-DNN库优化CNN网络 + - 高度优化循环网络,无需执行 `padding` 操作即可处理 **变长** 序列 + - 针对高维稀疏数据模型,优化了局部和分布式训练。 + + +- **稳定性** + + 有了 PaddlePaddle,使得利用各种CPU/GPU和机器来加速训练变得简单。PaddlePaddle 通过优化通信可以实现巨大吞吐量和快速执行。 + +- **与产品相连** + + 另外,PaddlePaddle 的设计也易于部署。在百度,PaddlePaddle 已经部署到含有巨大用户量的产品和服务上,包括广告点击率(CTR)预测、大规模图像分类、光学字符识别(OCR)、搜索排序,计算机病毒检测、推荐系统等等。PaddlePaddle广泛应用于百度产品中,产生了非常重要的影响。我们希望您也能探索 PaddlePaddle 的能力,为您的产品创造新的影响力和效果。 + +## 安装 + +推荐阅读官网上的[安装说明](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) + +## 文档 + +我们提供[英文](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html)和 +[中文](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) 文档 + +- [深度学习101](https://github.com/PaddlePaddle/book) + + 或许您想从这个在线交互式书籍开始,可以在Jupyter Notebook中运行 + +- [分布式训练](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html) + + 可以在MPI集群上运行分布式训练任务 + +- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html) + + 新的API支持代码更少更简洁的程序 + +- [贡献方式](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html) + + 欢迎您的贡献! 
+ +## 答疑 + +欢迎您将问题和bug报告以[Github Issues](https://github.com/PaddlePaddle/Paddle/issues)的形式提交 + +## 版权和许可证 +PaddlePaddle由[Apache-2.0 license](LICENSE)提供 diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 076e839120d98d801de4374f2f8338ebd918b88f..b0f54bf49aafb65f1a92fa95877de2cc61fc67d3 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -152,7 +152,12 @@ endif() if (WITH_MKLML AND MKLML_IOMP_LIB) message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}") - set(OPENMP_FLAGS "-fopenmp") + if(WIN32) + # OpenMP is not well supported on Windows for now, so no OpenMP flag is set there + set(OPENMP_FLAGS "") + else(WIN32) + set(OPENMP_FLAGS "-fopenmp") + endif(WIN32) set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}") diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index ea46f6418edf1db70b2a308dd49cf2131cc89d3b..ef4192ecc98ea6de0c81c1f33320528d547b818a 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -203,25 +203,26 @@ list(APPEND CUDA_NVCC_FLAGS "-w") list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr") if (NOT WIN32) -if(CMAKE_BUILD_TYPE STREQUAL "Debug") - list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) -elseif(CMAKE_BUILD_TYPE STREQUAL "Release") - list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) -elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") - list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) -elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") - # nvcc 9 does not support -Os. 
Use Release flags instead - list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) -endif() + if(CMAKE_BUILD_TYPE STREQUAL "Debug") + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) + elseif(CMAKE_BUILD_TYPE STREQUAL "Release") + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) + elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) + elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") + # nvcc 9 does not support -Os. Use Release flags instead + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) + endif() else(NOT WIN32) -list(APPEND CUDA_NVCC_FLAGS "--compiler-options;/bigobj") -if(CMAKE_BUILD_TYPE STREQUAL "Debug") - list(APPEND CUDA_NVCC_FLAGS "-g -G") - # match the cl's _ITERATOR_DEBUG_LEVEL - list(APPEND CUDA_NVCC_FLAGS "-D_DEBUG") -elseif(CMAKE_BUILD_TYPE STREQUAL "Release") - list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG") -else() + list(APPEND CUDA_NVCC_FLAGS "-Xcompiler \"/wd 4244 /wd 4267 /wd 4819\"") + list(APPEND CUDA_NVCC_FLAGS "--compiler-options;/bigobj") + if(CMAKE_BUILD_TYPE STREQUAL "Debug") + list(APPEND CUDA_NVCC_FLAGS "-g -G") + # match the cl's _ITERATOR_DEBUG_LEVEL + list(APPEND CUDA_NVCC_FLAGS "-D_DEBUG") + elseif(CMAKE_BUILD_TYPE STREQUAL "Release") + list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG") + else() - message(FATAL "Windows only support Release or Debug build now. Please set visual studio build type to Release/Debug, x64 build.") + message(FATAL_ERROR "Windows only support Release or Debug build now. Please set visual studio build type to Release/Debug, x64 build.") endif() endif(NOT WIN32) diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index 7a6a4523886824a67c82f9ce978de025ddb9c2cd..d3a4d69d3a05515fdf72074083470e19b4ec255c 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -20,8 +20,10 @@ SET(GLOG_INCLUDE_DIR "${GLOG_INSTALL_DIR}/include" CACHE PATH "glog include dire IF(WIN32) SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.lib" CACHE FILEPATH "glog library." 
FORCE) + SET(GLOG_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4530") ELSE(WIN32) SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." FORCE) + SET(GLOG_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) ENDIF(WIN32) INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR}) @@ -39,7 +41,7 @@ ExternalProject_Add( UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS=${GLOG_CMAKE_CXX_FLAGS} -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 6a7be73f09a278ab0fd29c7599a7781df3d29413..92fe76d05c7507c295b784bc37870abfc31a0a29 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -49,6 +49,8 @@ IF(NOT WIN32) SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value") SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}") SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}") +ELSE() + SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} /EHsc") ENDIF(NOT WIN32) ExternalProject_Add( @@ -61,7 +63,6 @@ ExternalProject_Add( UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} CMAKE_ARGS -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} CMAKE_ARGS -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} diff --git a/cmake/external/snappy.cmake b/cmake/external/snappy.cmake index 27d075336d556528ffaf1929c34753494692f0a0..1e01057aa606af78cd722d3619a710cb35817174 100644 --- a/cmake/external/snappy.cmake +++ b/cmake/external/snappy.cmake @@ -20,6 +20,12 @@ set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy) set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy) set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH 
"snappy include directory." FORCE) +if(WIN32) + SET(SNAPPY_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4267") +else() + SET(SNAPPY_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) +endif() + ExternalProject_Add( extern_snappy GIT_REPOSITORY "https://github.com/google/snappy" @@ -31,7 +37,7 @@ ExternalProject_Add( -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS=${SNAPPY_CMAKE_CXX_FLAGS} -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR} diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 9e6c47f016fe6dfd809c5b2bc88ff59d0a6b2b84..81e7868a6ad3fee16911a49ff9d1394a103706c5 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -147,12 +147,6 @@ set(GPU_COMMON_FLAGS -Wno-error=unused-function # Warnings in Numpy Header. -Wno-error=array-bounds # Warnings in Eigen::array ) - -else(NOT WIN32) -set(COMMON_FLAGS - "/w") #disable all warnings. 
-set(GPU_COMMON_FLAGS - "/w") #disable all warnings endif(NOT WIN32) if (APPLE) @@ -193,8 +187,7 @@ safe_set_static_flag() CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO) - if(${flag_var} MATCHES "/W3") - string(REGEX REPLACE "/W3" "/w" ${flag_var} "${${flag_var}}") - endif(${flag_var} MATCHES "/W3") + string(REGEX REPLACE "(^| )/W[0-9]( |$)" " " ${flag_var} "${${flag_var}}") + set(${flag_var} "${${flag_var}} /w") endforeach(flag_var) endif(WIN32) diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 59c40a0e5d18b753038f2b9301d1c9494e3901be..c2d04828564e69d7ac965881057f185194aa0475 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -52,8 +52,8 @@ function(op_library TARGET) endif() if(WITH_MKLDNN) string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}") - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_FILE}.cc) - list(APPEND mkldnn_cc_srcs ${MKLDNN_FILE}.cc) + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/mkldnn/${MKLDNN_FILE}.cc) + list(APPEND mkldnn_cc_srcs mkldnn/${MKLDNN_FILE}.cc) endif() endif() else() diff --git a/cmake/version.cmake b/cmake/version.cmake index ac10bdf067be549fe90112aef73fd6e1fbe0ac48..dd57d4ab9969ce530f93ca1694350b1a26b5b543 100644 --- a/cmake/version.cmake +++ b/cmake/version.cmake @@ -31,8 +31,23 @@ while ("${PADDLE_VERSION}" STREQUAL "") set(tmp_version "${GIT_TAG_NAME}~1") endif() else() - # otherwise, we always set PADDLE_VERSION to 0.0.0 to represent latest - set(PADDLE_VERSION "0.0.0") + execute_process( + COMMAND ${GIT_EXECUTABLE} describe --exact-match --tags ${tmp_version} + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} + OUTPUT_VARIABLE GIT_EXACT_TAG_NAME + RESULT_VARIABLE GIT_EXACT_TAG_RESULT + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + if (NOT "${GIT_EXACT_TAG_NAME}" STREQUAL "") + # Check if current branch is tag branch + if ("${GIT_EXACT_TAG_NAME}" MATCHES "v${TAG_VERSION_REGEX}") + string(REPLACE "v" "" 
PADDLE_VERSION ${GIT_EXACT_TAG_NAME}) + else() + set(PADDLE_VERSION "0.0.0") + endif() + else() + # otherwise, we always set PADDLE_VERSION to 0.0.0 to represent latest + set(PADDLE_VERSION "0.0.0") + endif() endif() else() set(PADDLE_VERSION "0.0.0") diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index afd3342768701adba4ff0040bd1c762b1cd8739d..f50a38842a21c795c979f859e88a9b16c3e54bd8 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -122,7 +122,7 @@ paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None)) paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False)) paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False)) -paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name'], varargs=None, keywords=None, defaults=(0, True, None)) +paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False)) paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None) 
paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)) @@ -142,10 +142,10 @@ paddle.fluid.layers.label_smooth ArgSpec(args=['label', 'prior_dist', 'epsilon', paddle.fluid.layers.roi_pool ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0)) paddle.fluid.layers.roi_align ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None)) paddle.fluid.layers.dice_loss ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,)) -paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None)) +paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None, True, 1)) paddle.fluid.layers.image_resize_short ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',)) -paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape'], varargs=None, keywords=None, defaults=(None, None, None, None)) -paddle.fluid.layers.resize_nearest ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape'], varargs=None, keywords=None, defaults=(None, None, None, None)) +paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, None, True, 1)) +paddle.fluid.layers.resize_nearest ArgSpec(args=['input', 
'out_shape', 'scale', 'name', 'actual_shape', 'align_corners'], varargs=None, keywords=None, defaults=(None, None, None, None, True)) paddle.fluid.layers.gather ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sequence_scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)) @@ -322,9 +322,11 @@ paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_class paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None)) paddle.fluid.layers.generate_mask_labels ArgSpec(args=['im_info', 'gt_classes', 'is_crowd', 'gt_segms', 'rois', 'labels_int32', 'num_classes', 'resolution'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name'], varargs=None, keywords=None, defaults=('encode_center_size', True, None)) +paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0)) paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'class_num', 'ignore_thresh', 'loss_weight_xy', 'loss_weight_wh', 'loss_weight_conf_target', 'loss_weight_conf_notarget', 'loss_weight_class', 'name'], varargs=None, keywords=None, 
defaults=(None, None, None, None, None, None)) +paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.box_clip ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.multiclass_nms ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)) paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)) paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)) paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) @@ -360,6 +362,9 @@ paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_b paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None)) paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.contrib.Calibrator.__init__ ArgSpec(args=['self'], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.contrib.Calibrator.sample_data ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.Calibrator.save_int8_model ArgSpec(args=['self'], varargs=None, 
keywords=None, defaults=None) paddle.fluid.contrib.reader.ctr_reader.ctr_reader ArgSpec(args=['feed_dict', 'file_type', 'file_format', 'dense_slot_index', 'sparse_slot_index', 'capacity', 'thread_num', 'batch_size', 'file_list', 'slots', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.contrib.build_compressor ArgSpec(args=['place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'config'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)) paddle.fluid.contrib.CompressPass.__init__ ArgSpec(args=['self', 'place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'program_exe'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 66f11dedbaccd7febcd75fa7ade9c68b6c42022c..910318a49cea50fadd29b1427a4591abfa5d5a23 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -128,7 +128,7 @@ cc_test(version_test SRCS version_test.cc DEPS version) cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version) -cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) +cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc memory_optimize_helper) nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto) @@ -192,6 +192,7 @@ cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry proto_desc) +cc_test(inplace_op_inference_test SRCS inplace_op_inference_test.cc DEPS op_registry proto_desc op_info memory_optimize_helper) cc_library(selected_rows SRCS 
selected_rows.cc DEPS tensor) cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index d5966ad5a97a97ec40c8a01d2d2c8ed5d7f90421..6621a59d37a670f7025507faeab5b9897794a72e 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -50,7 +50,9 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_ cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope) -cc_library(memory_optimize_pass SRCS analysis_var_pass.cc memory_reuse_types.cc DEPS graph graph_helper pass) +cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper) +cc_library(memory_optimize_pass SRCS memory_optimize_pass.cc DEPS memory_optimize_helper pass) +cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_info) cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper) cc_library(memory_early_delete_pass SRCS memory_early_delete_pass.cc DEPS memory_optimize_pass computation_op_handle scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass) @@ -65,12 +67,12 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle) -set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass 
all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass memory_early_delete_pass) +set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass memory_early_delete_pass inplace_op_pass) if (WITH_GPU) list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass) endif() -cc_test(memory_reuse_types_test SRCS memory_reuse_types_test.cc memory_reuse_types.cc DEPS framework_proto graph) -cc_test(analysis_var_pass_test SRCS analysis_var_pass_test.cc analysis_var_pass.cc memory_reuse_types.cc DEPS framework_proto graph graph_helper op_registry pass) +cc_test(memory_optimize_helper_test SRCS memory_optimize_helper_test.cc memory_optimize_helper.cc DEPS framework_proto graph) +cc_test(memory_optimize_pass_test SRCS memory_optimize_pass_test.cc memory_optimize_pass.cc memory_optimize_helper.cc DEPS framework_proto graph graph_helper op_registry pass) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS}) diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index ce5731a1f414e8ef6d8af22a3bb17109e82beb87..51ce9732722efa44d2489f5b77694094e58c8775 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -17,7 +17,7 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/details/memory_reuse_types.h" +#include "paddle/fluid/framework/details/memory_optimize_helper.h" #include "paddle/fluid/framework/details/multi_devices_graph_pass.h" #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h" #include "paddle/fluid/framework/details/reduce_op_handle.h" @@ -47,6 +47,22 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { AppendPass("sequential_execution_pass"); } + // Add op fusion. 
+ if (strategy.fuse_relu_depthwise_conv_) { + AppendPass("fuse_relu_depthwise_conv_pass"); + } + + // NOTE(dzhwinter): Notes on automatic inplace. + // 1. Passes that modify the program desc should be placed + // before the inplace pass. + // 2. Manually configured inplace should be placed + // before inplace_pass. + + // Add the automatic inplace pass. + if (strategy_.enable_inplace_) { + AppendPass("inplace_pass"); + } + // Add a graph viz pass to record a graph. if (!strategy_.debug_graphviz_path_.empty()) { auto viz_pass = AppendPass("graph_viz_pass"); @@ -55,10 +71,6 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { viz_pass->Set("graph_viz_path", new std::string(graph_path)); } - // Add op fusion. - if (strategy.fuse_relu_depthwise_conv_) { - AppendPass("fuse_relu_depthwise_conv_pass"); - } if (strategy.fuse_elewise_add_act_ops_) { auto fuse_elewise_add_act_pass = AppendPass("fuse_elewise_add_act_pass"); // Add a graph viz pass to record a graph. @@ -88,7 +100,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // A side-effect of that, memory optimize cannot forsee the fetched vars // , so fetchlist should be set persistable before call the Run interface. 
if (strategy.memory_optimize_) { - auto analysis_var_pass = AppendPass("analysis_var_pass"); + auto memory_optimize_pass = AppendPass("memory_optimize_pass"); } AppendMultiDevPass(strategy); @@ -186,8 +198,10 @@ std::unique_ptr BuildStrategy::Apply( pass->Erase("nccl_ctxs"); pass->SetNotOwned("nccl_ctxs", nctx); #endif - - } else if (pass->Type() == "analysis_var_pass") { + } else if (pass->Type() == "memory_optimize_pass") { + if (graph->Has(kAllOpDescs)) { + graph->Erase(kAllOpDescs); + } const std::vector *all_op_descs = new std::vector(main_program.Block(0).AllOps()); graph->Set>(kAllOpDescs, @@ -214,6 +228,13 @@ std::unique_ptr BuildStrategy::Apply( pass->Set>( kAllOpDescs, new std::vector(main_program.Block(0).AllOps())); + } else if (pass->Type() == "inplace_pass") { + if (graph->Has(kAllOpDescs)) { + graph->Erase(kAllOpDescs); + } + graph->Set>( + kAllOpDescs, + new std::vector(main_program.Block(0).AllOps())); } else if (pass->Type() == "fuse_relu_depthwise_conv_pass") { if (!use_cuda) { LOG(WARNING) << "fuse_relu_depthwise_conv_pass is only supported on " @@ -239,9 +260,10 @@ USE_PASS(allreduce_mode_multi_devices_pass); USE_PASS(dist_multi_devices_pass); USE_PASS(multi_devices_check_pass); USE_PASS(multi_devices_print_pass); -USE_PASS(analysis_var_pass); +USE_PASS(memory_optimize_pass); USE_PASS(sequential_execution_pass); USE_PASS(all_reduce_deps_pass); USE_PASS(modify_op_lock_and_record_event_pass); +USE_PASS(inplace_pass); USE_PASS(lock_free_optimize_pass); USE_PASS(graph_to_program_pass); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index cd24a3175953bf323748bf0c7e3159761c13f0a9..e3e06a5614ddee0bea342bc3608691b7a32326cc 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -80,6 +80,11 @@ struct BuildStrategy { bool memory_early_delete_{false}; + // TODO(dzhwinter): + // make enable_inplace, memory_optimize_ + // 
memory_early_delete_ true by default + bool enable_inplace_{false}; + bool enable_sequential_execution_{false}; bool fuse_broadcast_op_{false}; diff --git a/paddle/fluid/framework/details/graph_test_base.h b/paddle/fluid/framework/details/graph_test_base.h new file mode 100644 index 0000000000000000000000000000000000000000..126959bcd80a4677f76b7cff677a82a319f7cfb3 --- /dev/null +++ b/paddle/fluid/framework/details/graph_test_base.h @@ -0,0 +1,80 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include "glog/logging.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { + +class DummyOp : public OperatorBase { + public: + DummyOp(const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, const AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + private: + void RunImpl(const Scope& scope, + const platform::Place& place) const override {} +}; + +class SumOpMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "").AsDuplicable(); + AddOutput("Out", ""); + AddComment(""); + } +}; + +class AssignOpMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "").AsDuplicable(); + AddOutput("Out", ""); + AddComment(""); + } +}; + +class SplitOpMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", ""); + AddOutput("Out", "").AsDuplicable(); + AddComment(""); + } +}; + +class DummyVarTypeInference : public VarTypeInference { + public: + void operator()(const OpDesc& op_desc, BlockDesc* block) const override { + auto& inputs = op_desc.Input("X"); + auto type = block->Var(inputs.front())->GetType(); + auto out_var_name = op_desc.Output("Out").front(); + block->Var(out_var_name)->SetType(type); + } +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/inplace_op_pass.cc b/paddle/fluid/framework/details/inplace_op_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..78c5d5b50e606daa963e728355dc1bce83cd5484 --- /dev/null +++ b/paddle/fluid/framework/details/inplace_op_pass.cc @@ -0,0 +1,433 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/inplace_op_pass.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/details/memory_optimize_pass.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/op_info.h" + +// NOTE(dzhwinter): inplace means one op output variable reuse the input space. +// By our design, one operator only can read its input(const Variable), +// write its output(non-const Variable). If one operator is inplaced, means +// user have chance to write the space before reading happens. +// Especially when some optimize code writing style is applied. +// +// +// /* wrong case in operator */ +// /*In this case, a larger allocation is allocated, input content is lost*/ +// const Tensor* in = ctx.Input("In") +// Tensor* out = ctx.Output("Out"); +// auto* out_ptr = out->mutable_data(ctx.GetPlace()); +// out_ptr[0] = 0; // input contect is overwrited. + +// NOTE(dzhwinter): +// Only for backward compacity and stable. if enable_inplace_whitelist is turn +// on. +// only the ops in whitelist will be use inplace strategy. +// if not, all the op will be inplaced if it registered with InplaceClass +DEFINE_bool( + enable_inplace_whitelist, false, + "If this option turns on, only these op in whitelist can be inplaced." + "If it turns off, all of the running op can be candidate of inplaced op." 
+ "Such as scale, elementwise_add" + "By default, it's turned on"); + +DECLARE_string(memory_optimize_debug); + +// clang-format off +const std::string kInplacedOpWhiteList[] = { // NOLINT + "sigmoid", + "exp", + "relu", + "tanh", + "sqrt", + "ceil", + "floor", + "reciprocal", + "relu6", + "soft_relu", + "hard_sigmoid", + "batch_norm", + "batch_norm_grad", + "sum", + "sum_grad", + "scale", + "reshape", + "elementwise_add", + "elementwise_add_grad", +}; +// clang-format on + +namespace paddle { +namespace framework { +namespace details { + +static inline ir::Node* GetNextCascadeInplacedVar(ir::Node* var) { + // if next op is inplaced, then return the output var + // otherwise return nullptr + PADDLE_ENFORCE(var && var->IsVar() && !var->IsCtrlVar()); + ir::Node* inplaced_var = nullptr; + for (auto* next_op : var->outputs) { + for (auto* output : next_op->outputs) { + if (output->IsVar() && !output->IsCtrlVar() && + output->Name() == var->Name()) { + inplaced_var = output; + } + } + } + return inplaced_var; +} + +static inline ir::Node* GetPrevCascadeInplacedVar(ir::Node* var) { + PADDLE_ENFORCE(var && var->IsVar() && !var->IsCtrlVar()); + if (var->inputs.empty()) return nullptr; + auto* prev_op = var->inputs.at(0); + auto input_it = std::find_if(prev_op->inputs.begin(), prev_op->inputs.end(), + [&](ir::Node* node) { + if (node->IsVar() && !node->IsCtrlVar() && + node->Name() == var->Name()) { + return true; + } else { + return false; + } + }); + return input_it == prev_op->inputs.end() ? 
nullptr : *input_it; +} + +InplacePass::InplacePass() : Pass() { + if (FLAGS_enable_inplace_whitelist) { + for (auto& s : kInplacedOpWhiteList) { + whitelist_.emplace(s); + } + } +} + +void InplacePass::InitSSAGraphNodes() const { + std::unordered_map> all_vars; + for (auto* op : view_.AllOps()) { + for (auto* node : op->inputs) { + if (!node->IsVar() || node->IsCtrlVar()) continue; + if (all_vars[node->Name()].count(node) == 0) { + all_vars[node->Name()].emplace(node); + var_nodes_[node->Name()].emplace_back(node); + } + } + for (auto* node : op->outputs) { + if (!node->IsVar() || node->IsCtrlVar()) continue; + if (all_vars[node->Name()].count(node) == 0) { + all_vars[node->Name()].emplace(node); + var_nodes_[node->Name()].emplace_back(node); + } + } + } +} + +std::unique_ptr InplacePass::ApplyImpl( + std::unique_ptr graph) const { + var_nodes_.clear(); + view_.Build(graph.get()); + InitSSAGraphNodes(); + + for (auto* op : view_.AllOps()) { + if (FLAGS_enable_inplace_whitelist && !whitelist_.count(op->Name())) + continue; + TryInplaceOpInputOutput(op, graph.get()); + } + graph->ResolveHazard(var_nodes_); + + return graph; +} + +void InplacePass::InplaceModifyDesc(const std::string& var, + const std::string& cache_var, + const size_t& idx) const { + for (size_t i = idx; i < view_.AllOps().size(); ++i) { + ir::Node* op = view_.AllOps()[i]; + PADDLE_ENFORCE(op->IsOp() && op->Op()); + auto* op_desc = op->Op(); + op_desc->RenameInput(var, cache_var); + op_desc->RenameOutput(var, cache_var); + if (op_desc->Block()->HasVar(var)) op_desc->Block()->RemoveVar(var); + op_desc->Flush(); + } +} + +const SSANodePair InplacePass::TryInplaceModifyVar(const std::string& var, + const std::string& cache_var, + const size_t& idx, + ir::Graph* graph) const { + PADDLE_ENFORCE(var_nodes_[var].size() >= 1 && + var_nodes_[var].at(0)->Var() != nullptr); + std::unique_ptr var_desc(new VarDesc(*var_nodes_[var].at(0)->Var())); + var_desc->SetName(cache_var); + + SSANodePair swap_nodes; + + 
for (size_t i = idx; i < view_.AllOps().size(); ++i) { + auto* op = view_.AllOps()[i]; + + // redirect the input to the latest version of cache_var + for (auto* node : op->inputs) { + if (node->Name() == var) { + ir::Node* cache_node = graph->CreateVarNode(var_desc.get()); + + // swap node to cache_node + cache_node->outputs.insert(cache_node->outputs.end(), + node->outputs.begin(), node->outputs.end()); + PADDLE_ENFORCE(node->inputs.size() == 1 && node->inputs[0]->IsOp()); + auto* prev_op = node->inputs[0]; + std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), node, + cache_node); + cache_node->inputs.emplace_back(prev_op); + for (auto* next_op : node->outputs) { + std::replace(next_op->inputs.begin(), next_op->inputs.end(), node, + cache_node); + } + + swap_nodes.emplace_back(std::make_pair(node, cache_node)); + } + } + + // if we need to rename the output, + // always create a newer version of cache_var + for (auto* node : op->outputs) { + if (node->Name() == var) { + ir::Node* cache_node = graph->CreateVarNode(var_desc.get()); + // swap node to cache node + cache_node->outputs.insert(cache_node->outputs.end(), + node->outputs.begin(), node->outputs.end()); + cache_node->inputs.emplace_back(op); + std::replace(op->outputs.begin(), op->outputs.end(), node, cache_node); + for (auto* next_op : node->outputs) { + std::replace(next_op->inputs.begin(), next_op->inputs.end(), node, + cache_node); + } + + swap_nodes.emplace_back(std::make_pair(node, cache_node)); + } + } + } + + return swap_nodes; +} + +void InplacePass::CommitModify(const SSANodePair& swap_nodes, + ir::Graph* graph) const { + for (auto& pair : swap_nodes) { + auto *node = pair.first, *cache_node = pair.second; + const std::string var = node->Name(), cache_var = cache_node->Name(); + var_nodes_[cache_var].emplace_back(cache_node); + graph->RemoveNode(node); + auto& nodes = var_nodes_.at(var); + // release unused var in graph. 
Because python side memory optimize + // may reused the var in same name, so we only clear the var node + // after current inplaced index. + nodes.erase(std::remove(nodes.begin(), nodes.end(), node), nodes.end()); + } +} + +void InplacePass::WithdrawModify(const SSANodePair& nodes, + ir::Graph* graph) const { + for (auto& pair : nodes) { + auto *node = pair.first, *cache_node = pair.second; + const std::string var = node->Name(), cache_var = cache_node->Name(); + auto* prev_op = node->inputs[0]; + std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), cache_node, + node); + for (auto* next_op : node->outputs) { + std::replace(next_op->inputs.begin(), next_op->inputs.end(), cache_node, + node); + } + graph->RemoveNode(cache_node); + } +} + +void InplacePass::TryInplaceOpInputOutput(ir::Node* op, + ir::Graph* graph) const { + VLOG(4) << "Try to inplace op " << op->Name(); + PADDLE_ENFORCE(op->Op() != nullptr && op->Op()->Block() != nullptr, + "op_desc is nullptr"); + // some pre-requirments need to meet if the op want to inplaced. + + auto* op_desc = op->Op(); + auto& infer_inplace = + OpInfoMap::Instance().Get(op_desc->Type()).infer_inplace_; + + // 1. infer_inplace_ is registered. + if (!static_cast(infer_inplace)) return; + PADDLE_ENFORCE(static_cast(infer_inplace), + "%s's infer_inplace has not been registered", op_desc->Type()); + + auto* block = op_desc->Block(); + auto in_to_outs = infer_inplace(*op_desc, block); + + auto& all_ops = view_.AllOps(); + auto cursor = std::find(all_ops.begin(), all_ops.end(), op); + size_t idx = std::distance(all_ops.begin(), cursor); + + for (auto& pair : in_to_outs) { + auto& in_var_name = pair.first; + auto& out_var_name = pair.second; + auto* in_node = view_.GetNodeByName(in_var_name, op->inputs); + auto* out_node = view_.GetNodeByName(out_var_name, op->outputs); + + // 2. 
there is no external pending op on the input node + if (view_.PendingOpsOnVar(in_node).size() > 1) { + VLOG(4) << string::Sprintf( + "Skiped pair %s => %s. %s input has external dependency." + "inplace such pair will overwrite the memory.", + out_var_name, in_var_name, op->Name()); + continue; + } + + // 3. if output has been memory optimize by python(fluid.memory_optmize()). + // this candidate can not be inplaced. Will be deprecated in the future. + if (view_.InSkipSet(out_node->Name())) { + VLOG(4) << string::Sprintf( + "Skiped %s => %s reused previous memory block in python memory " + "optmize," + "it inplace may generate a circle", + out_var_name, in_var_name, op->Name()); + continue; + } + + // Debug Interface. Which would be skipped by the pass. + if (out_node->Name() == FLAGS_memory_optimize_debug) { + VLOG(3) << "Skiped var by force. FLAGS_memory_optimize_debug=" + << out_node->Name(); + continue; + } + + // NOTE(dzhwinter): + // two stage commit of inplaced process. if after inplace happens generate a + // circle, + // then withdraw the changes. Otherwise, safely add the node. + auto swap_nodes = + TryInplaceModifyVar(out_var_name, in_var_name, idx, graph); + + if (!ir::HasCircle(*graph)) { + VLOG(3) << string::Sprintf("!!! %s, %s => %s inplaced", op->Name(), + out_var_name, in_var_name); + InplaceModifyDesc(out_var_name, in_var_name, idx); + CommitModify(swap_nodes, graph); + } else { + VLOG(3) << string::Sprintf( + "Skiped pair %s => %s, inplace will generate a circle. withdraw %s", + out_var_name, in_var_name, op->Name()); + WithdrawModify(swap_nodes, graph); + } + } +} + +ir::Node* GraphView::GetNodeByName(const std::string& name, + const std::vector& nodes) const { + // nodes should be op->inputs/outputs + // node in same node do have different name. 
+ std::unordered_set nodes_in_op; + bool has_dup_node = + std::all_of(nodes.begin(), nodes.end(), [&nodes_in_op](ir::Node* node) { + if (!node->IsVar() || node->IsCtrlVar() || node->Var() == nullptr) { + if (nodes_in_op.count(node->Name())) return true; + nodes_in_op.emplace(node->Name()); + } + return false; + }); + PADDLE_ENFORCE(has_dup_node == false, "nodes has same name!"); + ir::Node* node = nullptr; + for (auto* it : nodes) { + if (!it->IsVar() || it->IsCtrlVar() || it->Var() == nullptr) continue; + if (it->Name() == name) { + node = it; + break; + } + } + PADDLE_ENFORCE(node != nullptr, + string::Sprintf("Not found var %s in nodes!", name)); + return node; +} + +std::vector GraphView::PendingOpsOnVar(ir::Node* node) { + // get the pending ops depends on same var node. + // because node also maybe a inplaced variable, so need to backtrack all the + // previous inplaced vars. + std::vector pending_ops; + ir::Node* p = node; + while (p != nullptr) { + pending_ops.insert(pending_ops.end(), p->outputs.begin(), p->outputs.end()); + p = GetPrevCascadeInplacedVar(p); + } + return pending_ops; +} + +void GraphView::Build(ir::Graph* g) { + // track the var nodes in correct order. + // Because we insert some new created node. Which may have data race between + // nodes. + // resolve data harzards depends on the var nodes in right order. + ops_ = SortOpLikeDescOrder(*g); + + // 1. track the nodes which reused previous node in Python memory optimize. + // these node can not be inplaced, otherwise may generate a circle in graph. + std::unordered_set all_vars; + for (auto& node : g->Nodes()) { + if (node->IsVar()) continue; + for (auto& out : node->outputs) { + if (out->IsCtrlVar() || out->Var() == nullptr) continue; + if (all_vars.count(out->Name())) { + dup_nodes_.emplace(out->Name()); + } else { + all_vars.emplace(out->Name()); + } + } + } + + // 2. track the nodes which used by parameter server. 
+ // these node can not be inplaced, otherwise trainer + // pserver can not find each other name. + auto update_skip_set = [&](ir::Node* node) { + for (auto& in : node->inputs) { + if (in->IsVar() && in->Var() != nullptr) dup_nodes_.emplace(in->Name()); + } + for (auto& out : node->outputs) { + if (out->IsVar() && out->Var() != nullptr) + dup_nodes_.emplace(out->Name()); + } + }; + for (auto& node : g->Nodes()) { + if (!node->IsOp()) continue; + if (node->Name() == "send") update_skip_set(node); + if (node->Name() == "recv") update_skip_set(node); + if (node->Name() == "prefetch") update_skip_set(node); + } +} + +const std::vector& GraphView::AllOps() { return ops_; } + +bool GraphView::InSkipSet(const std::string& var) const { + return dup_nodes_.count(var); +} + +} // namespace details +} // namespace framework +} // namespace paddle + +REGISTER_PASS(inplace_pass, paddle::framework::details::InplacePass); diff --git a/paddle/fluid/framework/details/inplace_op_pass.h b/paddle/fluid/framework/details/inplace_op_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..1abcf1f279e225839d440ff9c6840ce9b8a6547f --- /dev/null +++ b/paddle/fluid/framework/details/inplace_op_pass.h @@ -0,0 +1,93 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may abtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/details/memory_optimize_helper.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace details { + +class GraphView { + public: + GraphView() = default; + + void Build(ir::Graph* g); + + const std::vector& AllOps(); + + ir::Node* GetNodeByName(const std::string& name, + const std::vector& nodes) const; + + std::vector PendingOpsOnVar(ir::Node* var); + + // Will Deperated in the future. + // NOTE(dzhwinter) : + // 1. Python memory optimize will reuse + // memory based var name, so different op output may + // have the same variable name. enable inplace on such node + // will generate a circle in ssa graph. + // 2. DistributeTranspiler will use unique name to + // map the parameter and gradient, must be skipped. + bool InSkipSet(const std::string& var) const; + + private: + std::vector ops_; + std::unordered_set dup_nodes_; // mem opt affect nodes + std::map> adj_list_; +}; + +typedef std::vector> SSANodePair; +class InplacePass : public ir::Pass { + public: + InplacePass(); + + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; + + void InitSSAGraphNodes() const; + + private: + const SSANodePair TryInplaceModifyVar(const std::string& var, + const std::string& cache_var, + const size_t& idx, + ir::Graph* graph) const; + + void CommitModify(const SSANodePair&, ir::Graph* graph) const; + + void WithdrawModify(const SSANodePair& nodes, ir::Graph* graph) const; + + void InplaceModifyDesc(const std::string& in_var, const std::string& out_var, + const size_t& idx) const; + + void TryInplaceOpInputOutput(ir::Node* op, ir::Graph* graph) const; + + mutable std::map> var_nodes_; + + mutable std::unordered_set whitelist_; + mutable GraphView view_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git 
a/paddle/fluid/framework/details/memory_early_delete_pass.cc b/paddle/fluid/framework/details/memory_early_delete_pass.cc index 5906b7d57ce122520a4594f1528e00982eaa1a7f..69f8f705484450b0544291b19027eb174d7eeb8f 100644 --- a/paddle/fluid/framework/details/memory_early_delete_pass.cc +++ b/paddle/fluid/framework/details/memory_early_delete_pass.cc @@ -16,7 +16,7 @@ #include #include #include -#include "paddle/fluid/framework/details/memory_reuse_types.h" +#include "paddle/fluid/framework/details/memory_optimize_helper.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/reference_count_pass_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h" diff --git a/paddle/fluid/framework/details/memory_reuse_types.cc b/paddle/fluid/framework/details/memory_optimize_helper.cc similarity index 66% rename from paddle/fluid/framework/details/memory_reuse_types.cc rename to paddle/fluid/framework/details/memory_optimize_helper.cc index 2b9ff518b9adcd366cc877998400a8bdc05fa033..b56ef021ef508a43aac082acbcfa6f543635203e 100644 --- a/paddle/fluid/framework/details/memory_reuse_types.cc +++ b/paddle/fluid/framework/details/memory_optimize_helper.cc @@ -12,8 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/details/memory_reuse_types.h" +#include "paddle/fluid/framework/details/memory_optimize_helper.h" +#include #include +#include #include #include @@ -21,15 +23,17 @@ namespace paddle { namespace framework { namespace details { +size_t NodeSizeInBytes(const VarDesc& node) { + auto shape = node.GetShape(); + int size = + std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); + size_t type_size = SizeOfType(node.GetDataType()); + return type_size * std::abs(size); +} + size_t NodeSizeInBytes(ir::Node* n) { auto* desc = FindVarDescInBlock(n); - auto shape = desc->GetShape(); - size_t type_size = SizeOfType(desc->GetDataType()); - int size = 1; - for (auto& s : shape) { - size *= s; - } - return type_size * std::abs(size); + return NodeSizeInBytes(*desc); } std::string DebugStringImpl(VarDesc* var) { @@ -83,7 +87,7 @@ struct NodeComparator { } }; -void OrderedNodePairPool::Insert(ir::Node* var, ir::Node* op) { +void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) { PADDLE_ENFORCE(var->IsVar() && !var->IsCtrlVar()); PADDLE_ENFORCE(op->IsOp()); if (mark_table_.count(var->Name()) != 0) { @@ -119,11 +123,11 @@ void OrderedNodePairPool::Insert(ir::Node* var, ir::Node* op) { mark_table_[var->Name()] = it; } -int OrderedNodePairPool::GetIndex(ir::Node* var) { +int OrderedNodeList::GetIndex(ir::Node* var) { return std::distance(nodes_.begin(), mark_table_[var->Name()]); } -ir::Node* OrderedNodePairPool::NodeMatch(ir::Node* var) const { +ir::Node* OrderedNodeList::NodeMatch(ir::Node* var) const { ir::Node* found_node = nullptr; NodeComparator compare_node; @@ -136,13 +140,15 @@ ir::Node* OrderedNodePairPool::NodeMatch(ir::Node* var) const { return found_node; } -void OrderedNodePairPool::Erase(ir::Node* var) { - PADDLE_ENFORCE(mark_table_.count(var->Name())); - nodes_.erase(mark_table_[var->Name()]); - mark_table_.erase(var->Name()); +void OrderedNodeList::Erase(ir::Node* var) { Erase(var->Name()); } + +void 
OrderedNodeList::Erase(const std::string& var) { + PADDLE_ENFORCE(mark_table_.count(var)); + nodes_.erase(mark_table_[var]); + mark_table_.erase(var); } -std::string OrderedNodePairPool::ToString() const { +std::string OrderedNodeList::ToString() const { std::stringstream ss; for (auto it = nodes_.begin(); it != nodes_.end(); ++it) { ss << DebugString(it->first) << " "; @@ -150,6 +156,43 @@ std::string OrderedNodePairPool::ToString() const { return ss.str(); } +bool NodeCanReused(ir::Node* node) { + if (node == nullptr || !node->IsVar() || node->IsCtrlVar()) return false; + // auto* desc = node->Var(); + bool flag = NodeCanReused(*node->Var()); + for (auto* op : node->inputs) { + if (op->Op()->HasAttr("force_cpu")) { + // op output force generated in cpu, can not be reused. + flag &= framework::AttrReader(op->Op()->GetAttrMap()) + .Get("force_cpu") == 0; + } + } + return flag; +} + +bool NodeCanReused(const VarDesc& node) { + auto type = node.GetType(); + if (node.Persistable() || type != proto::VarType::LOD_TENSOR || + node.GetShape().empty()) { + return false; + } + // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. 
For example, while_grad + std::string name = node.Name(); + if (!name.empty() && name[0] == '@' && name[name.size() - 1] == '@') + return false; + return true; +} + +bool OpHasSubBlock(OpDesc* desc) { + const AttributeMap& attrs = desc->GetAttrMap(); + for (auto& attr : attrs) { + if (attr.second.type() == typeid(BlockDesc*) || // NOLINT + attr.second.type() == typeid(std::vector)) // NOLINT + return true; + } + return false; +} + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/memory_reuse_types.h b/paddle/fluid/framework/details/memory_optimize_helper.h similarity index 69% rename from paddle/fluid/framework/details/memory_reuse_types.h rename to paddle/fluid/framework/details/memory_optimize_helper.h index 9a9c1d948e869016717fea9ff6b8236adfc29845..064183d61ea7386b6b45034c90fd7569a8647f60 100644 --- a/paddle/fluid/framework/details/memory_reuse_types.h +++ b/paddle/fluid/framework/details/memory_optimize_helper.h @@ -43,7 +43,7 @@ using GraphNodePool = std::vector< // For example, // node0[-1, 1] node1[-1, 1, 1], node2[1,1], node3[1,1024], .. // O(1) insert, delete -class OrderedNodePairPool { +class OrderedNodeList { public: using NodePair = std::pair>; using Iter = typename std::list::iterator; @@ -53,8 +53,12 @@ class OrderedNodePairPool { void Erase(ir::Node* var); + void Erase(const std::string& var); + bool Has(ir::Node* var) { return mark_table_.count(var->Name()); } + bool Has(const std::string& var) { return mark_table_.count(var); } + ir::Node* NodeMatch(ir::Node* var) const; // map store non-const iterator, can not promise const int GetIndex(ir::Node* var); @@ -67,6 +71,11 @@ class OrderedNodePairPool { ConstIter end() const { return nodes_.end(); } size_t size() const { return nodes_.size(); } + void Clear() { + mark_table_.clear(); + nodes_.clear(); + } + private: // for searching. 
std::unordered_map mark_table_; @@ -74,14 +83,53 @@ class OrderedNodePairPool { std::list nodes_; }; +// valid a tensor can be reuse or not +bool NodeCanReused(ir::Node* node); + +// valid a tensor can be reuse or not. +bool NodeCanReused(const VarDesc& node); + +// check op has subblock or not +bool OpHasSubBlock(OpDesc* desc); + // node memory size in bytes size_t NodeSizeInBytes(ir::Node* n); +// node memory size in bytes +size_t NodeSizeInBytes(const VarDesc&); + std::string DebugString(ir::Node* var); -// std::string DebugString(VarDesc* var); VarDesc* FindVarDescInBlock(ir::Node* n); +template +class FilterVariableImpl { + public: + void operator()(const Container& nodes, Callback callback) { + for (auto* node : nodes) { + callback(node); + } + } +}; + +// filter var node for op->inputs/outputs +template +class FilterVariableImpl, Callback> { + public: + void operator()(const std::vector& nodes, Callback callback) { + for (auto* var : nodes) { + if (var->IsVar() && !var->IsCtrlVar()) { + callback(var); + } + } + } +}; + +template +void FilterVariables(const Container& nodes, Callback callback) { + FilterVariableImpl()(nodes, callback); +} + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/memory_reuse_types_test.cc b/paddle/fluid/framework/details/memory_optimize_helper_test.cc similarity index 96% rename from paddle/fluid/framework/details/memory_reuse_types_test.cc rename to paddle/fluid/framework/details/memory_optimize_helper_test.cc index d2fabf5ce068e0f752b86c0d02b971f18fc65f01..f2b9baf14a34ace9cc860797280dbd519dfa4f2a 100644 --- a/paddle/fluid/framework/details/memory_reuse_types_test.cc +++ b/paddle/fluid/framework/details/memory_optimize_helper_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/details/memory_reuse_types.h" +#include "paddle/fluid/framework/details/memory_optimize_helper.h" #include #include #include @@ -27,8 +27,8 @@ namespace paddle { namespace framework { namespace details { -TEST(OrderedNodePairPool, Normal) { - OrderedNodePairPool pool; +TEST(OrderedNodeList, Normal) { + OrderedNodeList pool; std::vector> nodes; // clang-format off diff --git a/paddle/fluid/framework/details/analysis_var_pass.cc b/paddle/fluid/framework/details/memory_optimize_pass.cc similarity index 78% rename from paddle/fluid/framework/details/analysis_var_pass.cc rename to paddle/fluid/framework/details/memory_optimize_pass.cc index 223b9da3cfba33fc32d1334cddccb9f503bd0bef..85de14a60a8fe6958794f0ac25768b9da1943f9d 100644 --- a/paddle/fluid/framework/details/analysis_var_pass.cc +++ b/paddle/fluid/framework/details/memory_optimize_pass.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/details/analysis_var_pass.h" +#include "paddle/fluid/framework/details/memory_optimize_pass.h" #include #include #include @@ -48,39 +48,10 @@ static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) { op1->Outputs() == op2->Outputs(); } -template -class FilterVariableImpl { - public: - void operator()(const Container& nodes, Callback callback) { - for (auto* node : nodes) { - callback(node); - } - } -}; - -// filter var node for op->inputs/outputs -template -class FilterVariableImpl, Callback> { - public: - void operator()(const std::vector& nodes, Callback callback) { - for (auto* var : nodes) { - if (var->IsVar() && !var->IsCtrlVar()) { - callback(var); - } - } - } -}; - -template -void FilterVariables(const Container& nodes, Callback callback) { - FilterVariableImpl()(nodes, callback); -} - -std::unique_ptr AnalysisVarPass::ApplyImpl( +std::unique_ptr MemoryOptimizePass::ApplyImpl( std::unique_ptr graph) const { auto nodes = graph->Nodes(); - auto subblock_vars = GetSubBlockVars(nodes); - skip_set_.insert(subblock_vars.begin(), subblock_vars.end()); + CollectSkipVarsSet(nodes); cfg_.reset(new details::ControlFlowGraph(*graph)); cfg_->LiveVariableAnalysis(); @@ -103,48 +74,53 @@ std::unique_ptr AnalysisVarPass::ApplyImpl( } for (auto& var : op->outputs) { - if (NodeCanReused(var) && cfg_->Use(op).count(var->Name()) == 0) { - ir::Node* cache = pool_.NodeMatch(var); - if (var->Name() == FLAGS_memory_optimize_debug) { - VLOG(3) << "start match var " << DebugString(var) << " of op " - << op->Name(); - VLOG(3) << pool_.ToString(); - VLOG(3) << "matched in pool : " - << ((cache == nullptr) ? "False" : "True"); - } - if (cache != nullptr) { - if (var->Name() == cache->Name()) { - VLOG(3) << "The same cache variable is cascade reused." - << var->Name() << " is re-filled to the pool after" - << "the reused op is finished. Current op can not " - << "replace it again. 
Skip this candidate."; - continue; - } + if (!NodeCanReused(var) || cfg_->Use(op).count(var->Name()) == 0 || + skip_set_.count(var->Name())) + continue; + ir::Node* cache = pool_.NodeMatch(var); + + if (var->Name() == FLAGS_memory_optimize_debug) { + VLOG(3) << "start match var " << DebugString(var) << " of op " + << op->Name(); + VLOG(3) << pool_.ToString(); + VLOG(3) << "matched in pool : " + << ((cache == nullptr) ? "False" : "True"); + } - int node_idx_in_pool = pool_.GetIndex(cache); - VLOG(3) << string::Sprintf( - "!!! %s, %s => %s, cache idx %d, pool size %d", - std::to_string(reuse_id++), DebugString(var), DebugString(cache), - node_idx_in_pool, static_cast(pool_.size())); - // update CFG Graph on the fly. - // reused var maybe re-fill into the pool - cfg_->RenameVarInCFGGraph(var->Name(), cache->Name(), idx); - // NOTE(dzhwinter): we need to both update the ProgramDesc - // and IR Graph. because op_desc/var_desc is used in CreateOp, - // CreateVar when running happens. But IR Graph - // define the dependence relationship between nodes. - RenameVarInGraphDesc(var->Name(), cache->Name(), idx); - RenameVarInGraphNode(var->Name(), cache->Name(), idx, graph.get()); - - pool_.Erase(cache); + if (cache == nullptr) continue; + if (var->Name() == cache->Name()) { + VLOG(3) << "The same cache variable is cascade reused." << var->Name() + << " is re-filled to the pool after" + << "the reused op is finished. Current op can not " + << "replace it again. Skip this candidate."; + continue; + + int node_idx_in_pool = pool_.GetIndex(cache); + VLOG(3) << string::Sprintf( + "!!! %s, %s => %s, cache idx %d, pool size %d", + std::to_string(reuse_id++), DebugString(var), DebugString(cache), + node_idx_in_pool, static_cast(pool_.size())); + // update CFG Graph on the fly. + // reused var maybe re-fill into the pool + cfg_->RenameVarInCFGGraph(var->Name(), cache->Name(), idx); + // NOTE(dzhwinter): we need to both update the ProgramDesc + // and IR Graph. 
because op_desc/var_desc is used in CreateOp, + // CreateVar when running happens. But IR Graph + // define the dependence relationship between nodes. + RenameVarInGraphDesc(var->Name(), cache->Name(), idx); + RenameVarInGraphNode(var->Name(), cache->Name(), idx, graph.get()); + + pool_.Erase(cache); + } + // fill the pool + std::unordered_set unlived_vars; + for (auto var : cfg_->LiveIn(op)) { + if (cfg_->LiveOut(op).count(var) == 0) { + unlived_vars.emplace(var); } } - } - // fill the pool - for (auto var : cfg_->LiveIn(op)) { - if (cfg_->LiveOut(op).count(var) == 0) { + for (auto var : unlived_vars) { ir::Node* var_node = cfg_->GetNodeFromVarName(var, op); - if (var_node == nullptr) continue; if (NodeCanReused(var_node) && !pool_.Has(var_node)) { pool_.Insert(var_node, op); } @@ -177,7 +153,7 @@ std::unique_ptr AnalysisVarPass::ApplyImpl( return graph; } -void AnalysisVarPass::SubGraphOptimize(OpDesc* op_desc) const { +void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const { // conditional block, while op and their grad op auto* sub_block_desc = AttrReader(op_desc->GetAttrMap()).Get("sub_block"); @@ -247,25 +223,32 @@ void AnalysisVarPass::SubGraphOptimize(OpDesc* op_desc) const { } } -std::unordered_set AnalysisVarPass::GetSubBlockVars( +void MemoryOptimizePass::CollectSkipVarsSet( const std::unordered_set& nodes) const { - std::unordered_set vars; + auto update_skip_set = [&](OpDesc* op_desc) { + auto inputs = op_desc->InputArgumentNames(); + auto outputs = op_desc->OutputArgumentNames(); + skip_set_.insert(inputs.begin(), inputs.end()); + skip_set_.insert(outputs.begin(), outputs.end()); + }; for (auto& op : nodes) { if (!op->IsOp() || op->Op() == nullptr) continue; auto* op_desc = op->Op(); - if (OpHasSubBlock(op_desc)) { - auto inputs = op_desc->InputArgumentNames(); - auto outputs = op_desc->OutputArgumentNames(); - vars.insert(inputs.begin(), inputs.end()); - vars.insert(outputs.begin(), outputs.end()); - } + // NOTE(dzhwinter): + // current 
block can not reuse next level block vars. + if (OpHasSubBlock(op_desc)) update_skip_set(op_desc); + // NOTE(dzhwinter): + // distributed ops input/output name need to + // keep same bettwen trainer/pserver + if (op_desc->Type() == "send") update_skip_set(op_desc); + if (op_desc->Type() == "recv") update_skip_set(op_desc); + if (op_desc->Type() == "prefetch") update_skip_set(op_desc); } - return vars; } -void AnalysisVarPass::RenameVarInGraphDesc(const std::string& var, - const std::string& cache_var, - size_t idx) const { +void MemoryOptimizePass::RenameVarInGraphDesc(const std::string& var, + const std::string& cache_var, + size_t idx) const { for (size_t i = idx; i < cfg_->Ops().size(); ++i) { auto* op = cfg_->Ops()[i]; PADDLE_ENFORCE(op->IsOp() && op->Op()); @@ -277,7 +260,7 @@ void AnalysisVarPass::RenameVarInGraphDesc(const std::string& var, } } -void AnalysisVarPass::InitSSAGraphNodes() const { +void MemoryOptimizePass::InitSSAGraphNodes() const { std::unordered_map> all_vars; if (var_nodes_.empty()) { for (auto* op : cfg_->Ops()) { @@ -297,9 +280,10 @@ void AnalysisVarPass::InitSSAGraphNodes() const { } } -void AnalysisVarPass::RenameVarInGraphNode(const std::string& var, - const std::string& cache_var, - size_t idx, ir::Graph* graph) const { +void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var, + const std::string& cache_var, + size_t idx, + ir::Graph* graph) const { // if replace happens, we need to create a newer version cache_var // but use the same dims/data_type with var. 
PADDLE_ENFORCE(var_nodes_[var].size() >= 1 && @@ -358,39 +342,6 @@ void AnalysisVarPass::RenameVarInGraphNode(const std::string& var, var_nodes_.at(var).clear(); } -bool AnalysisVarPass::NodeCanReused(ir::Node* node) const { - if (!node->IsVar() || node->IsCtrlVar()) return false; - auto* desc = node->Var(); - auto type = desc->GetType(); - if (desc->Persistable() || type != proto::VarType::LOD_TENSOR || - desc->GetShape().empty()) { - return false; - } - // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad - std::string name = node->Name(); - if (!name.empty() && name[0] == '@' && name[name.size() - 1] == '@') - return false; - if (skip_set_.count(name)) return false; - for (auto* op : node->inputs) { - if (op->Op()->HasAttr("force_cpu")) { - // op output force generated in cpu, can not be reused. - return framework::AttrReader(op->Op()->GetAttrMap()) - .Get("force_cpu") == 0; - } - } - return true; -} - -bool AnalysisVarPass::OpHasSubBlock(OpDesc* desc) const { - const AttributeMap& attrs = desc->GetAttrMap(); - for (auto& attr : attrs) { - if (attr.second.type() == typeid(BlockDesc*) || // NOLINT - attr.second.type() == typeid(std::vector)) // NOLINT - return true; - } - return false; -} - std::vector SortOpLikeDescOrder(const ir::Graph& graph) { PADDLE_ENFORCE(graph.Has(kAllOpDescs), "Graph has no attribute of kAllOpDescs."); @@ -651,6 +602,7 @@ ir::Node* ControlFlowGraph::GetNodeFromVarName(const std::string& name, } // namespace framework } // namespace paddle -REGISTER_PASS(analysis_var_pass, paddle::framework::details::AnalysisVarPass) +REGISTER_PASS(memory_optimize_pass, + paddle::framework::details::MemoryOptimizePass) .RequireGraphAttr(paddle::framework::details::kGraphNodePool) .RequireGraphAttr(paddle::framework::details::kAllOpDescs); diff --git a/paddle/fluid/framework/details/analysis_var_pass.h b/paddle/fluid/framework/details/memory_optimize_pass.h similarity index 85% rename from paddle/fluid/framework/details/analysis_var_pass.h 
rename to paddle/fluid/framework/details/memory_optimize_pass.h index 144204beafb341351172c29e3b4cd41db49be6f9..3d6b1897f3b5106054b8f647f9cf613ebd1d65ff 100644 --- a/paddle/fluid/framework/details/analysis_var_pass.h +++ b/paddle/fluid/framework/details/memory_optimize_pass.h @@ -25,7 +25,7 @@ #include #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/details/memory_reuse_types.h" +#include "paddle/fluid/framework/details/memory_optimize_helper.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/pass.h" @@ -35,12 +35,10 @@ namespace details { constexpr char kAllOpDescs[] = "all_op_descs"; std::vector SortOpLikeDescOrder(const ir::Graph& graph); -// sort op in bfs order -std::vector BFSSortGraphOps(const ir::Graph& graph); class ControlFlowGraph; -class AnalysisVarPass : public ir::Pass { +class MemoryOptimizePass : public ir::Pass { protected: std::unique_ptr ApplyImpl( std::unique_ptr graph) const override; @@ -57,17 +55,14 @@ class AnalysisVarPass : public ir::Pass { ir::Graph* graph) const; void SubGraphOptimize(OpDesc* op_desc) const; - // valid a tensor can be reuse or not - bool NodeCanReused(ir::Node* node) const; - // scan subblock and collect the output/input variables. - std::unordered_set GetSubBlockVars( - const std::unordered_set&) const; - // check op has subblock or not - bool OpHasSubBlock(OpDesc* desc) const; + // 1. scan op with subblock and collect the output/input vars. + // while, while_grad, conditional_block + // 2. scan distributed ops and collect the output/input vars + void CollectSkipVarsSet(const std::unordered_set&) const; private: // Reuse Node Pool, Owned. 
- mutable OrderedNodePairPool pool_; + mutable OrderedNodeList pool_; // controlflow Graph mutable std::unique_ptr cfg_; // skip set diff --git a/paddle/fluid/framework/details/analysis_var_pass_test.cc b/paddle/fluid/framework/details/memory_optimize_pass_test.cc similarity index 90% rename from paddle/fluid/framework/details/analysis_var_pass_test.cc rename to paddle/fluid/framework/details/memory_optimize_pass_test.cc index 9bc4fd33f7058949ca60983ea666a21cb4877b3e..3d3dfa93594d496431f7cb60dceb26f20250fc16 100644 --- a/paddle/fluid/framework/details/analysis_var_pass_test.cc +++ b/paddle/fluid/framework/details/memory_optimize_pass_test.cc @@ -12,63 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/details/analysis_var_pass.h" +#include "paddle/fluid/framework/details/memory_optimize_pass.h" #include #include #include #include "glog/logging.h" #include "gtest/gtest.h" +#include "paddle/fluid/framework/details/graph_test_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -namespace paddle { -namespace framework { - -class DummyOp : public OperatorBase { - public: - DummyOp(const std::string& type, const VariableNameMap& inputs, - const VariableNameMap& outputs, const AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const Scope& scope, - const platform::Place& place) const override {} -}; - -class SumOpMaker : public OpProtoAndCheckerMaker { - public: - void Make() { - AddInput("X", "").AsDuplicable(); - AddOutput("Out", ""); - AddComment(""); - } -}; - -class AssignOpMaker : public OpProtoAndCheckerMaker { - public: - void Make() { - AddInput("X", "").AsDuplicable(); - AddOutput("Out", ""); - AddComment(""); - } -}; - -class 
DummyVarTypeInference : public VarTypeInference { - public: - void operator()(const OpDesc& op_desc, BlockDesc* block) const override { - auto& inputs = op_desc.Input("X"); - auto type = block->Var(inputs.front())->GetType(); - auto out_var_name = op_desc.Output("Out").front(); - block->Var(out_var_name)->SetType(type); - } -}; - -} // namespace framework -} // namespace paddle - REGISTER_OPERATOR(sum, paddle::framework::DummyOp, paddle::framework::SumOpMaker, paddle::framework::DummyVarTypeInference); @@ -141,15 +97,6 @@ inline static ProgramDesc FillProgramDesc() { return prog; } -template -inline static std::string DebugString(const Container& c) { - std::stringstream ss; - for (auto& item : c) { - ss << item << " "; - } - return ss.str(); -} - TEST(CFGGraph, IRGraph) { // prepare ir graph auto prog = FillProgramDesc(); diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h index eea7e712f8f6e187cdceedce77cc76d1d4ca2101..0901e59f9786b43361e7a570f8c2a07be54c1ac2 100644 --- a/paddle/fluid/framework/details/op_registry.h +++ b/paddle/fluid/framework/details/op_registry.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/grad_op_desc_maker.h" +#include "paddle/fluid/framework/inplace_op_inference.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/operator.h" @@ -32,7 +33,8 @@ enum OpInfoFillType { kOpProtoAndCheckerMaker = 1, kGradOpDescMaker = 2, kVarTypeInference = 3, - kShapeInference = 4 + kShapeInference = 4, + kInplaceOpInference = 5 }; template @@ -48,8 +50,11 @@ struct OpInfoFillTypeID { ? kVarTypeInference : (std::is_base_of::value ? kShapeInference - : static_cast( - -1))))); + : (std::is_base_of< + InplaceOpInference, T>::value + ? 
kInplaceOpInference + : static_cast( + -1)))))); } }; @@ -139,6 +144,16 @@ struct OpInfoFiller { } }; +template +struct OpInfoFiller { + void operator()(const char* op_type, OpInfo* info) const { + info->infer_inplace_ = [](const OpDesc& op_desc, BlockDesc* block) { + T infer; + return infer(op_desc, block); + }; + } +}; + } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/inplace_op_inference.h b/paddle/fluid/framework/inplace_op_inference.h new file mode 100644 index 0000000000000000000000000000000000000000..03ab2a2b6c5dc07805fddddc3ac53f61e7b6a697 --- /dev/null +++ b/paddle/fluid/framework/inplace_op_inference.h @@ -0,0 +1,115 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include "glog/logging.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/details/memory_optimize_helper.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/type_defs.h" + +namespace paddle { +namespace framework { + +/* + Inplace Inference for create In->Out pairs for inplaced operator. + If we specify a pair of corresponding names. For example, X->Out. + then Out will inplaced use X's memory. The base class will do + legality validation for both variables. 
+*/ +class InplaceOpInference { + public: + virtual ~InplaceOpInference() {} + virtual std::unordered_map operator()( + const OpDesc& op_desc, BlockDesc* block) const = 0; +}; + +class InplaceInToOut : public InplaceOpInference { + public: + std::unordered_map operator()( + const OpDesc& op_desc, BlockDesc* block) const { + std::unordered_map ret; + auto in_out_var_names_pair = this->Apply(op_desc, block); + for (auto& pair : in_out_var_names_pair) { + PADDLE_ENFORCE(!op_desc.Input(pair.first).empty(), + string::Sprintf("op %s do not have input of %s!", + op_desc.Type(), pair.first)); + PADDLE_ENFORCE(!op_desc.Output(pair.second).empty(), + string::Sprintf("op %s do not have output of %s!", + op_desc.Type(), pair.second)); + auto& in_name = op_desc.Input(pair.first).at(0); + auto& out_name = op_desc.Output(pair.second).at(0); + + auto in = block->FindRecursiveOrCreateVar(in_name); + auto out = block->FindRecursiveOrCreateVar(out_name); + if (TryInplaceInputOutput(in, out)) ret.insert({in_name, out_name}); + } + return ret; + } + + protected: + virtual std::unordered_map Apply( + const OpDesc& op_desc, BlockDesc* block) const = 0; + + bool TryInplaceInputOutput(const VarDesc& in, const VarDesc& out) const { + return in.Name() != out.Name() && details::NodeCanReused(in) && + details::NodeCanReused(out) && + details::NodeSizeInBytes(out) <= details::NodeSizeInBytes(in); + } +}; + +/* + Inplace In and Out for operator only have an Input and an Output. + For example, activation op. + */ +class SingleOpInplaceInToOut : public InplaceInToOut { + protected: + std::unordered_map Apply( + const OpDesc& op_desc, BlockDesc* block) const override { + PADDLE_ENFORCE(!op_desc.InputNames().empty(), + "Op inputs must not be empty"); + PADDLE_ENFORCE(!op_desc.OutputNames().empty(), + "Op outputs must not be empty"); + auto x_name = op_desc.InputNames().at(0); + auto out_name = op_desc.OutputNames().at(0); + return std::unordered_map{{x_name, out_name}}; + } +}; + +/* + Gradient op. 
Inplace output use it's Input. + For example, Input@Grad->Input reuse strategy. + */ +class GradOpInplaceInToOut : public InplaceInToOut { + protected: + std::unordered_map Apply( + const OpDesc& op_desc, BlockDesc* block) const override { + std::unordered_map ret; + std::unordered_set output_names(op_desc.OutputNames().begin(), + op_desc.OutputNames().end()); + for (auto& input_name : op_desc.InputNames()) { + if (output_names.count(GradVarName(input_name))) { + ret.insert({input_name, GradVarName(input_name)}); + } + } + return ret; + } +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/inplace_op_inference_test.cc b/paddle/fluid/framework/inplace_op_inference_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..3e4d715c6f089496d1b1f7906e3f10147a073622 --- /dev/null +++ b/paddle/fluid/framework/inplace_op_inference_test.cc @@ -0,0 +1,288 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include +#include +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/var_type_inference.h" + +namespace paddle { +namespace framework { + +class NOP : public OperatorBase { + public: + NOP(const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, const AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + private: + void RunImpl(const Scope& scope, + const platform::Place& place) const override {} +}; + +class SingleOpMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "").AsDuplicable(); + AddOutput("Out", ""); + AddComment(""); + } +}; + +class SingleGradOpMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* op = new framework::OpDesc(); + op->SetType("single_op_grad"); + op->SetInput("Out", OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + return std::unique_ptr(op); + } +}; + +class SingleOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override { + ctx->HasInput("X"); + ctx->HasOutput("Out"); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + } +}; + +class SingleGradOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override { + ctx->HasInput(framework::GradVarName("Out")); + ctx->HasOutput(framework::GradVarName("X")); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out")); + } +}; + +class MultiOutOpMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "").AsDuplicable(); + AddInput("Y", "").AsDuplicable(); + 
AddInput("Z", "").AsDuplicable(); + AddOutput("Out", ""); + AddOutput("YOut", ""); + AddOutput("ZOut", ""); + AddOutput("NotReuseOut", ""); + AddComment(""); + } +}; + +class MultiOutShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override { + ctx->ShareDim("X", "Out"); + ctx->ShareDim("Y", "YOut"); + ctx->ShareDim("Z", "ZOut"); + } +}; + +class MultiGradOpMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* op = new framework::OpDesc(); + op->SetType("multi_out_grad"); + op->SetInput("X", Input("X")); + op->SetOutput(framework::GradVarName("Y"), OutputGrad("YOut")); + op->SetOutput(framework::GradVarName("X"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("Z"), OutputGrad("ZOut")); + return std::unique_ptr(op); + } +}; + +class MultiOutGradShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override { + ctx->SetOutputDim(framework::GradVarName("Y"), + ctx->GetInputDim(framework::GradVarName("YOut"))); + ctx->SetOutputDim(framework::GradVarName("X"), + ctx->GetInputDim(framework::GradVarName("Out"))); + ctx->SetOutputDim(framework::GradVarName("Z"), + ctx->GetInputDim(framework::GradVarName("ZOut"))); + } +}; + +class MultiOutInplaceInToOut : public framework::InplaceInToOut { + public: + using framework::InplaceInToOut::InplaceInToOut; + + protected: + std::unordered_map Apply( + const OpDesc& op_desc, BlockDesc* block) const override { + return std::unordered_map{ + {"X", "Out"}, {"Y", "YOut"}, {"Z", "ZOut"}, + }; + } +}; + +class MultiOutGradInplaceInToOut : public framework::InplaceInToOut { + public: + using framework::InplaceInToOut::InplaceInToOut; + + protected: + std::unordered_map Apply( + const OpDesc& op_desc, BlockDesc* block) const override { + return 
std::unordered_map{ + {framework::GradVarName("YOut"), framework::GradVarName("Y")}, + {framework::GradVarName("Out"), framework::GradVarName("X")}, + {framework::GradVarName("ZOut"), framework::GradVarName("Z")}, + }; + } +}; + +} // namespace framework +} // namespace paddle + +namespace f = paddle::framework; +REGISTER_OPERATOR(single_op, f::NOP, f::SingleOpMaker, f::SingleGradOpMaker, + f::SingleOpInplaceInToOut, f::SingleOpShapeInference); +REGISTER_OPERATOR(single_op_grad, f::NOP, f::SingleOpInplaceInToOut, + f::SingleGradOpShapeInference); +REGISTER_OPERATOR(multi_out_op, f::NOP, f::MultiOutOpMaker, f::MultiGradOpMaker, + f::MultiOutInplaceInToOut, f::MultiOutShapeInference); +REGISTER_OPERATOR(multi_out_grad, f::NOP, f::MultiOutGradInplaceInToOut, + f::MultiOutGradShapeInference); + +namespace paddle { +namespace framework { + +TEST(InferInplace, SingleOpInplaceInToOut) { + ProgramDesc prog; + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("single_op"); + op->SetInput("X", {"test2_a", "test2_b", "test2_c"}); + op->SetOutput("Out", {"test2_out"}); + + prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64}); + prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("test2_out"); + prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16}); + + auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; + auto in_to_outs = infer_inplace(*op, op->Block()); + EXPECT_EQ(in_to_outs.size(), 1ul); + auto it = in_to_outs.begin(); + EXPECT_EQ(it->first, "test2_a"); + EXPECT_EQ(it->second, "test2_out"); +} + +TEST(InferInplace, SingleGradOpInplaceInToOut) { + ProgramDesc prog; + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("single_op_grad"); + op->SetInput(GradVarName("Out"), {"test2_out"}); + op->SetOutput(GradVarName("X"), 
{"test2_a", "test2_b", "test2_c"}); + + prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("test2_out"); + prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16}); + + auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; + auto in_to_outs = infer_inplace(*op, op->Block()); + EXPECT_EQ(in_to_outs.size(), 1ul); + auto it = in_to_outs.begin(); + EXPECT_EQ(it->first, "test2_out"); + EXPECT_EQ(it->second, "test2_a"); +} + +TEST(InferInplace, MultiOutInplaceInToOut) { + ProgramDesc prog; + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("multi_out_op"); + op->SetInput("X", {"a0", "a1"}); + op->SetInput("Y", {"b0"}); + op->SetInput("Z", {"c0", "c1"}); + op->SetOutput("Out", {"o0"}); + op->SetOutput("YOut", {"y0"}); + op->SetOutput("ZOut", {"z0"}); + + prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("o0"); + prog.MutableBlock(0)->Var("y0"); + prog.MutableBlock(0)->Var("z0"); + prog.MutableBlock(0)->Var("a0")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("b0")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("c0")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("o0")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("y0")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("z0")->SetShape({32, 16}); + + auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; + auto in_to_outs = infer_inplace(*op, op->Block()); + EXPECT_EQ(in_to_outs.size(), 3ul); + std::unordered_map expects = { + {"a0", "o0"}, 
{"b0", "y0"}, {"c0", "z0"}, + }; + EXPECT_TRUE(expects == in_to_outs); +} + +TEST(InferInplace, MultiGradInplaceInToOut) { + ProgramDesc prog; + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("multi_out_grad"); + op->SetInput(GradVarName("Out"), {"o0"}); + op->SetInput(GradVarName("YOut"), {"y0"}); + op->SetInput(GradVarName("ZOut"), {"z0"}); + op->SetOutput(GradVarName("X"), {"a0", "a1"}); + op->SetOutput(GradVarName("Y"), {"b0"}); + op->SetOutput(GradVarName("Z"), {"c0", "c1"}); + + prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("o0"); + prog.MutableBlock(0)->Var("y0"); + prog.MutableBlock(0)->Var("z0"); + prog.MutableBlock(0)->Var("a0")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("b0")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("c0")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("o0")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("y0")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("z0")->SetShape({32, 16}); + + auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; + auto in_to_outs = infer_inplace(*op, op->Block()); + + EXPECT_EQ(in_to_outs.size(), 3ul); + std::unordered_map expects = { + {"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"}, + }; + EXPECT_TRUE(expects == in_to_outs); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index b118dccd1b3de881b4791bff6cd331726c8e05da..07c2c970d4de3cecf03e4cf80e60e81e7a9595a8 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -10,8 +10,22 @@ function(pass_library TARGET DEST) set(options "") set(oneValueArgs "") set(multiValueArgs SRCS DEPS) + set(targetPrefix "") 
+ + # Get optional argument + set(extraMacroArgs ${ARGN}) + list(LENGTH extraMacroArgs numExtraMacroArgs) + if(numExtraMacroArgs GREATER 0) + list(GET extraMacroArgs 0 targetPrefix) + endif() + cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base ${op_library_DEPS}) + if(targetPrefix) + cc_library(${TARGET} SRCS ${targetPrefix}/${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base ${op_library_DEPS}) + else() + cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base ${op_library_DEPS}) + endif() + # add more DEST here, such as train, dist and collect USE_PASS into a file automatically. if (${DEST} STREQUAL "base" OR ${DEST} STREQUAL "inference") message(STATUS "add pass ${TARGET} ${DEST}") @@ -51,6 +65,7 @@ pass_library(conv_elementwise_add2_act_fuse_pass inference) pass_library(conv_elementwise_add_fuse_pass inference) pass_library(conv_affine_channel_fuse_pass inference) pass_library(transpose_flatten_concat_fuse_pass inference) +pass_library(identity_scale_op_clean_pass base) # There may be many transpose-flatten structures in a model, and the output of # these structures will be used as inputs to the concat Op. 
This pattern will @@ -62,11 +77,11 @@ foreach (index RANGE 3 6) endforeach() if(WITH_MKLDNN) - pass_library(mkldnn_placement_pass base) - pass_library(depthwise_conv_mkldnn_pass base) - pass_library(conv_bias_mkldnn_fuse_pass inference) - pass_library(conv_relu_mkldnn_fuse_pass inference) - pass_library(conv_elementwise_add_mkldnn_fuse_pass inference) + pass_library(mkldnn_placement_pass base mkldnn) + pass_library(depthwise_conv_mkldnn_pass base mkldnn) + pass_library(conv_bias_mkldnn_fuse_pass inference mkldnn) + pass_library(conv_relu_mkldnn_fuse_pass inference mkldnn) + pass_library(conv_elementwise_add_mkldnn_fuse_pass inference mkldnn) endif() cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector ) @@ -86,7 +101,7 @@ cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framewor cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DEPS seqpool_concat_fuse_pass framework_proto) cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass) if (WITH_MKLDNN) - cc_test(test_depthwise_conv_mkldnn_pass SRCS depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass) - cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass) - cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass) + cc_test(test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass) + cc_test(test_conv_relu_mkldnn_fuse_pass SRCS mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass) + cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass) endif () diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index 
8bb3c27bdd32d07d58913db043569f6a3bf69aeb..b7f7c3d82e0da4d3ca8795487fa52fba0394e365 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -141,7 +141,8 @@ class Graph { ir::Node *CreateControlDepVar() { // TODO(panyx0718): control var name should be really unique. const std::string name = string::Sprintf( - "%s@%llu", ir::Node::kControlDepVarName, node_set_.size()); + "%s@%llu", static_cast(ir::Node::kControlDepVarName), + node_set_.size()); auto *x = AddNode(new ir::Node(name, ir::Node::Type::kVariable)); x->SetId(num_node_created_++); return x; diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index 8de93cf285e4bf34c2d2bf425fa5f3459704b3d6..22d4c0a91cc1638264a8c57aa2841ff4e65a1400 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -52,16 +52,29 @@ bool HasCircleHelper( ir::Node *node, const std::map> &adj_list, std::unordered_set *visited, - std::unordered_set *in_trace) { + std::unordered_set *in_trace, + std::vector> *circles) { if (visited->find(node) == visited->end()) { visited->insert(node); in_trace->insert(node); for (ir::Node *in : adj_list.at(node)) { if (visited->find(in) == visited->end() && - HasCircleHelper(in, adj_list, visited, in_trace)) { + HasCircleHelper(in, adj_list, visited, in_trace, circles)) { return true; } else if (in_trace->find(in) != in_trace->end()) { + if (circles != nullptr) { + std::vector circle; + circle.emplace_back(in); + ir::Node *p = in; + for (auto &adj : adj_list.at(p)) { + if (in_trace->count(adj)) { + circle.emplace_back(adj); + p = adj; + } + } + circles->emplace_back(circle); + } return true; } } @@ -71,11 +84,12 @@ bool HasCircleHelper( } bool HasCircleInternal( - const std::map> &adj_list) { + const std::map> &adj_list, + std::vector> *circles) { std::unordered_set visited; std::unordered_set in_trace; for (auto &adj : adj_list) { - if (HasCircleHelper(adj.first, adj_list, 
&visited, &in_trace)) { + if (HasCircleHelper(adj.first, adj_list, &visited, &in_trace, circles)) { return true; } } @@ -84,13 +98,18 @@ bool HasCircleInternal( } // namespace bool HasCircle(const Graph &graph) { - return HasCircleInternal(BuildOperationAdjList(graph)); + return HasCircleInternal(BuildOperationAdjList(graph), nullptr); +} + +bool FindCircleSubGraph(const Graph &graph, + std::vector> *circles) { + return HasCircleInternal(BuildOperationAdjList(graph), circles); } std::vector TopologySortOperations(const Graph &graph) { std::map> adj_list = BuildOperationAdjList(graph); - PADDLE_ENFORCE(!HasCircleInternal(adj_list)); + PADDLE_ENFORCE(!HasCircleInternal(adj_list, nullptr)); std::unordered_set visited; std::vector ret; for (auto adj : adj_list) { diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h index fba4936f2c5c971f6c63a452ec4480ff091db25c..214de9ec7d85aee6021b18866295777e317aa79d 100644 --- a/paddle/fluid/framework/ir/graph_helper.h +++ b/paddle/fluid/framework/ir/graph_helper.h @@ -28,6 +28,11 @@ namespace ir { // Test if the graph contains circle. bool HasCircle(const Graph &graph); +// Find All Circles for debugging, +// store all subgraph in circles. +bool FindCircleSubGraph(const Graph &graph, + std::vector> *circles); + size_t GraphNum(const Graph &graph); // Topology Sort the operations in the graph from inputs to outputs. 
diff --git a/paddle/fluid/framework/ir/graph_helper_test.cc b/paddle/fluid/framework/ir/graph_helper_test.cc index 260a73ae763bd2cdea9948e4d928377a7c718dda..d8973d5aeda1a2e0650a506b4c916b4346f01e2d 100644 --- a/paddle/fluid/framework/ir/graph_helper_test.cc +++ b/paddle/fluid/framework/ir/graph_helper_test.cc @@ -195,6 +195,17 @@ void BuildTwoGraphs(Graph* g) { // v4->outputs.push_back(o5); } +TEST(GraphHelperTest, Circles) { + ProgramDesc prog; + + Graph g(prog); + BuildCircleGraph(&g); + + std::vector> circles; + ASSERT_TRUE(FindCircleSubGraph(g, &circles)); + ASSERT_EQ(circles.size(), 1UL); +} + TEST(GraphHelperTest, GraphNum) { ProgramDesc prog; diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 6282ced1e47329915bb3626b410e55ad8251071d..9ea0729e1f3339c2f17371ecc8fa51325b9629bb 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -117,11 +117,6 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) { // return false; } } - for (auto &item : pdnodes2nodes_) { - for (auto &n : item.second) { - GetMarkedNodes(const_cast(&graph)).insert(n); - } - } VLOG(3) << pdnodes2nodes_.size() << " nodes marked"; return !pdnodes2nodes_.empty(); diff --git a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..3b738aa159ebfd77f00c9e532fbd94542e2097db --- /dev/null +++ b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/identity_scale_op_clean_pass.h" +#include +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +std::unique_ptr IdentityScaleOpCleanPass::ApplyImpl( + std::unique_ptr graph) const { + FusePassBase::Init("identity_scale_op_clean", graph.get()); + + // pre_op -> scale_in -> scale_op -> scale_out + // -> + // pre_op -> scale_out + GraphPatternDetector detector; + auto pre_op = detector.mutable_pattern()->NewNode("pre_op")->assert_is_op(); + auto scale_in = detector.mutable_pattern() + ->NewNode("scale_in") + ->assert_is_op_input("scale") + ->AsIntermediate(); + auto scale_op = detector.mutable_pattern() + ->NewNode("scale_fuse") + ->assert_is_op("scale") + ->assert_op_attr("scale", 1.) 
+ ->assert_op_attr("bias", 0.); + auto scale_out = detector.mutable_pattern() + ->NewNode("scale_out") + ->assert_is_op_output("scale"); + + pre_op->LinksTo({scale_in}); + scale_op->LinksFrom({scale_in}).LinksTo({scale_out}); + + GraphPatternDetector::handle_t handler = [&]( + const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { + Node* scale_op_var = subgraph.at(scale_op); + Node* scale_in_var = subgraph.at(scale_in); + Node* scale_out_var = subgraph.at(scale_out); + Node* pre_op_var = subgraph.at(pre_op); + // Link pre_op directly to scale_out + const std::string scale_in_name = scale_in_var->Name(); + const std::string scale_out_name = scale_out_var->Name(); + // Remove links in graph + GraphSafeRemoveNodes(graph, {scale_in_var, scale_op_var}); + // Modify proto message + auto* pre_op_desc = pre_op_var->Op(); + for (auto& parameter : *pre_op_desc->Proto()->mutable_outputs()) { + auto* arguments = parameter.mutable_arguments(); + auto it = std::find(arguments->begin(), arguments->end(), scale_in_name); + PADDLE_ENFORCE(it != arguments->end()); + *it = scale_out_name; + } + + IR_NODE_LINK_TO(pre_op_var, scale_out_var); + }; + + detector(graph.get(), handler); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(identity_scale_op_clean_pass, + paddle::framework::ir::IdentityScaleOpCleanPass); diff --git a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..50a654d82f0e4fb7e8e91c665397716407e6d2a5 --- /dev/null +++ b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h @@ -0,0 +1,33 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class IdentityScaleOpCleanPass : public FusePassBase { + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + + private: + virtual ~IdentityScaleOpCleanPass() = default; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc similarity index 98% rename from paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc rename to paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc index d4a701e0b173a96d8605dff308fee7007a0ecc0c..5d0b294f6fec5f14dcddb91f8ceffb27fc833d4e 100644 --- a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h" +#include "paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h" #include #include #include diff --git a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h similarity index 100% rename from paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h rename to paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc similarity index 99% rename from paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.cc rename to paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc index a8029e67e659a269f8492cf6e2f1f09040144283..fb3db81347b102cfa264082b36a2e22ea8c22982 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h" +#include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h" #include #include #include diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h similarity index 100% rename from paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h rename to paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc similarity index 98% rename from paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass_tester.cc rename to paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc index 61ba097fd8cb55e25bda1947ea97d53308c55bd3..9ef5c298b8cddfec094e9544dc6da9afdcaf0dab 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc @@ -15,8 +15,8 @@ #include #include -#include "paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h" #include "paddle/fluid/framework/ir/graph_traits.h" +#include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.cc similarity index 97% rename from paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc rename to paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.cc index e359a3832ee8d549f8c58d63bc1cc6564ecadede..4f4605398a665e63662a64a3a925c32d48f10952 100644 --- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.cc @@ -12,7 +12,7 @@ // See 
the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h" +#include "paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h" #include #include #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h similarity index 100% rename from paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h rename to paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc similarity index 98% rename from paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc rename to paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc index 19248b4dfee1da81d18cd2effac08ba68dde80fb..06d56f6222e4bb9a9969d4ab2d260c97d1ce6c72 100644 --- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h" +#include "paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h" #include #include "paddle/fluid/framework/op_proto_maker.h" diff --git a/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc similarity index 96% rename from paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc rename to paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc index 19056e18aa892dbc83dfbf7305b6ad8b6b6bc51c..7851e8c84bca2e3b05d3b1603eaa4c0ca5909e10 100644 --- a/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h" +#include "paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h similarity index 100% rename from paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h rename to paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h diff --git a/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc similarity index 98% rename from paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass_tester.cc rename to paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc index 09d0b15f46a7e50afb6aea46383013ce6a6c6118..1783e3322b1df8125f580f09a12aefe64d246c1a 100644 --- a/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc @@ 
-12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h" +#include "paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h" #include diff --git a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc similarity index 95% rename from paddle/fluid/framework/ir/mkldnn_placement_pass.cc rename to paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc index 951fcb066ce759ebfec0182e1e9dca887e343170..20e52410ffe3caa86450bc05bf3aabf5a5bce374 100644 --- a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/ir/mkldnn_placement_pass.h" +#include "paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h" #include namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn_placement_pass.h b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h similarity index 100% rename from paddle/fluid/framework/ir/mkldnn_placement_pass.h rename to paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h diff --git a/paddle/fluid/framework/op_info.h b/paddle/fluid/framework/op_info.h index 19e5c2c73eac74dee030a4f7820531800f737e4e..4b55bd0703eee399cd841f90ea0b18d8fbdc67e8 100644 --- a/paddle/fluid/framework/op_info.h +++ b/paddle/fluid/framework/op_info.h @@ -38,6 +38,7 @@ struct OpInfo { OpAttrChecker* checker_{nullptr}; InferVarTypeFN infer_var_type_; InferShapeFN infer_shape_; + InferInplaceOpFN infer_inplace_; bool HasOpProtoAndChecker() const { return proto_ != nullptr && checker_ != nullptr; diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 
ab3cf308fc04e227d5402712f6bab226fea04711..9d6c10ab9e33d0e9888fa484030be9da7752512e 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -555,18 +555,17 @@ Tensor* ExecutionContext::LegacyOutput(const std::string& name) const { template <> std::vector ExecutionContext::MultiOutput( const std::string& name) const { - auto names = op().Outputs(name); + auto it = ctx_.outputs.find(name); + if (it == ctx_.outputs.end()) { + return {}; + } + const std::vector& vars = it->second; std::vector res; - res.reserve(names.size()); - std::transform(names.begin(), names.end(), std::back_inserter(res), - [&](const std::string& sub_name) -> Tensor* { - auto var = scope_.FindVar(sub_name); - if (var == nullptr) return nullptr; - PADDLE_ENFORCE( - var->IsType(), - "%s should be LoDTensor, but the received type is %s", - sub_name, ToTypeName(var->Type())); - return var->GetMutable(); + res.reserve(vars.size()); + std::transform(vars.begin(), vars.end(), std::back_inserter(res), + [&](Variable* var) -> Tensor* { + return var == nullptr ? 
nullptr + : var->GetMutable(); }); return res; } diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index 938e2024c3359c2acd65a1aa4af875a8350e4c58..d02c699b979d7693bd83fd43fc73f7e0aeddb0cc 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -57,5 +57,8 @@ using InferVarTypeFN = using InferShapeFN = std::function; +using InplacePair = std::unordered_map; +using InferInplaceOpFN = std::function; + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 5db422119966948f75970874e13d416ea699158a..ec8dedd605235a2d197e6a313bd589d5b9520cdf 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -1,5 +1,5 @@ if(WITH_PYTHON) -cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas) -cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context) +cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind) +cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind) cc_library(engine SRCS engine.cc) endif() diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 83fc6ee2e299f5fa18d5cc6f220c0be6a66e709d..47488d4dea79f285769f29c93f7888a7f783f070 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -156,6 +156,8 @@ class Autograd { for (auto it : candidate->pre_ops_) { for (OpBase* pre_op : it.second) { if (!pre_op) continue; + VLOG(5) << "op dep " << candidate->op_desc_->Type() << " <---- " + << it.first << " <---- " << pre_op->op_desc_->Type(); if (visited.find(pre_op) == visited.end()) { visited.insert(pre_op); queue.push_back(pre_op); diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index dc97433a5102b39d03ea5cac3157c027f9d67c98..78205486c5534ac0c61cc6d545bdafa4dfc95695 100644 --- a/paddle/fluid/imperative/layer.h +++ 
b/paddle/fluid/imperative/layer.h @@ -28,6 +28,7 @@ #include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/imperative/type_defs.h" @@ -140,16 +141,24 @@ class VarBase { void RunBackward(); void TrackPreOp(OpBase* pre_op, const std::string& pre_op_out_name, - int pre_op_out_idx, bool stop_gradient) { + int pre_op_out_idx, bool pre_op_stop_gradient) { pre_op_ = pre_op; pre_op_out_name_ = pre_op_out_name; pre_op_out_idx_ = pre_op_out_idx; - stop_gradient_ = stop_gradient; + if (pre_op_stop_gradient) { + stop_gradient_ = pre_op_stop_gradient; + } } void ClearGradient() { - delete grads_; - grads_ = new VarBase(true); + VLOG(1) << "clear gradient of " << var_desc_->Name(); + if (grads_ && grads_->var_ && grads_->var_->IsInitialized()) { + auto grads_t = grads_->var_->GetMutable(); + operators::math::set_constant( + *(platform::DeviceContextPool::Instance().Get( + grads_->var_->Get().place())), + grads_t, 0.0); + } } framework::LoDTensor& GradValue(); diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index cd62807a5532e6b2309cb5a8f679c3097b51c9e9..bc39d11ba00a6a7c386162a1f9201c6f992c8692 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -31,6 +31,7 @@ void CreateGradOp(const framework::OpDesc& op_desc, framework::OpInfoMap::Instance() .Get(op_desc.Type()) .GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block); + for (auto& desc : descs) { grad_op_descs->emplace_back(desc.release()); } @@ -84,11 +85,12 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, op->input_vars_ = inputs; for (auto it : op->input_vars_) { auto& invars = invars_map[it.first]; + invars.reserve(it.second.size()); for (VarBase* inp : it.second) { PADDLE_ENFORCE_NOT_NULL(inp->var_, "op %s input %s nullptr", op->op_desc_->Type(), 
inp->var_desc_->Name()); - invars.push_back(inp->var_); + invars.emplace_back(inp->var_); vars[inp->var_desc_->Name()] = inp; if (inp->PreOp()) { op->pre_ops_[it.first].push_back(inp->PreOp()); @@ -105,9 +107,10 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, for (auto it : op->output_vars_) { auto& outvars = outvars_map[it.first]; const std::vector& outputs = it.second; + outvars.reserve(outputs.size()); for (size_t i = 0; i < outputs.size(); ++i) { VarBase* out = outputs[i]; - outvars.push_back(out->var_); + outvars.emplace_back(out->var_); vars[out->var_desc_->Name()] = out; framework::VarDesc* var_desc = block->FindVar(out->var_desc_->Name()); diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 11484a647303b32a6006bef3cfe4be6b3f0d533d..157862016e3556902f6507e02417624363ed1029 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -58,12 +58,13 @@ if(WIN32) sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder) - target_link_libraries(paddle_fluid_shared shlwapi) else(WIN32) cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder) endif() +get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) +target_link_libraries(paddle_fluid_shared ${os_dependency_modules}) set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) if(NOT APPLE AND NOT WIN32) diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index a2546ead93c3baeb8029f6451d8a60dcc75f8571..2f31b182af7293488719e41a92b2ea78709bda02 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -132,7 +132,7 @@ struct Argument { 
DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int); DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int); DECL_ARGUMENT_FIELD(tensorrt_precision_mode, TensorRtPrecisionMode, - contrib::AnalysisConfig::Precision); + AnalysisConfig::Precision); // Memory optimized related. DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool); diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index 120f6ef27d49ae59ec36304dc3742cd9ca0afa4b..59107f28080dceb0a58e17d42281db5f3773de56 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -32,7 +32,7 @@ limitations under the License. */ #ifdef _WIN32 #include #include -#define GCC_ATTRIBUTE(attr__) ; +#define GCC_ATTRIBUTE(attr__) #define MKDIR(path) _mkdir(path) #else #include diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 99611ce84b23896dd173831a03d77c6e0252d998..7476c199cfd073ec0962fa9a48f24750a6484bb5 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -71,7 +71,7 @@ void IRPassManager::CreatePasses(Argument *argument, new framework::ProgramDesc *(&argument->main_program())); bool enable_int8 = argument->tensorrt_precision_mode() == - contrib::AnalysisConfig::Precision::kInt8; + AnalysisConfig::Precision::kInt8; pass->Set("enable_int8", new bool(enable_int8)); std::string model_opt_cache_dir = @@ -83,7 +83,6 @@ void IRPassManager::CreatePasses(Argument *argument, new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir))); } - // graph_ = pass->Apply(std::move(graph_)); pre_pass = pass_name; passes_.emplace_back(std::move(pass)); @@ -97,8 +96,9 @@ std::unique_ptr IRPassManager::Apply(std::unique_ptr graph) { PADDLE_ENFORCE(graph.get()); // Apply all the passes for (const auto &pass : passes_) { - if (pass->Type() == 
"graph_viz_pass") continue; - PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass->Type()); + if (pass->Type() != "graph_viz_pass") { + PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass->Type()); + } graph = pass->Apply(std::move(graph)); } return std::move(graph); diff --git a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt index eb6e1768a2c01f1388962eefe8e70368cae8cf8b..410a90132aa7657a23b858570763547fe53730a0 100644 --- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt @@ -1,4 +1,7 @@ cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc) +if(WITH_TESTING) + add_dependencies(subgraph_detector gtest) +endif() if (WITH_GPU AND TENSORRT_FOUND) cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller) diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc index 3d1be9196fdeacd8ff852dbb595473a687352ccf..4b0a9d9b1c48fcb0d5e44ec1b977c817f3c70b2e 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include "paddle/fluid/framework/ir/graph_helper.h" @@ -168,7 +169,11 @@ bool FindSuitableTensorToReuse( if (!cluster->count(candidate)) continue; size_t space = space_table.at(candidate); - size_t space_diff = std::abs(space - space_required); + PADDLE_ENFORCE( + space <= std::numeric_limits::type>::max(), + "space overload"); + size_t space_diff = + std::abs((std::make_signed::type)space - space_required); if (space_diff < best_fit.second) { best_fit.first = candidate; best_fit.second = space_diff; diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h 
b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h index 216f416de0d1003b944337ee98fb4e6a22c66fc5..2da565f2ae15a50a207173b10d4c350456086582 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h @@ -13,7 +13,9 @@ // limitations under the License. #pragma once - +#include +#include +#include #include "paddle/fluid/inference/analysis/analysis_pass.h" #include "paddle/fluid/platform/port.h" diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 8efd514bd8397f099fd07321ad7e5d4ca253e229..e92273b4dd94f11e0e90c91fd82dafe42bf158f3 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -22,7 +22,7 @@ namespace paddle { -PassStrategy *contrib::AnalysisConfig::pass_builder() const { +PassStrategy *AnalysisConfig::pass_builder() const { if (!pass_builder_.get()) { if (use_gpu_) { LOG(INFO) << "Create GPU IR passes"; @@ -42,27 +42,27 @@ PassStrategy *contrib::AnalysisConfig::pass_builder() const { return pass_builder_.get(); } -contrib::AnalysisConfig::AnalysisConfig(const std::string &model_dir) { +AnalysisConfig::AnalysisConfig(const std::string &model_dir) { model_dir_ = model_dir; Update(); } -contrib::AnalysisConfig::AnalysisConfig(const std::string &prog_file, - const std::string ¶ms_file) { +AnalysisConfig::AnalysisConfig(const std::string &prog_file, + const std::string ¶ms_file) { prog_file_ = prog_file; params_file_ = params_file; Update(); } -void contrib::AnalysisConfig::SetModel(const std::string &prog_file_path, - const std::string ¶ms_file_path) { +void AnalysisConfig::SetModel(const std::string &prog_file_path, + const std::string ¶ms_file_path) { prog_file_ = prog_file_path; params_file_ = params_file_path; Update(); } -void contrib::AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, - int device_id) { +void 
AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, + int device_id) { #ifdef PADDLE_WITH_CUDA use_gpu_ = true; memory_pool_init_size_mb_ = memory_pool_init_size_mb; @@ -74,13 +74,13 @@ void contrib::AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, Update(); } -void contrib::AnalysisConfig::DisableGpu() { +void AnalysisConfig::DisableGpu() { use_gpu_ = false; Update(); } -contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { +AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { #define CP_MEMBER(member__) member__ = other.member__; // Model related. @@ -130,7 +130,7 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { Update(); } -void contrib::AnalysisConfig::EnableMKLDNN() { +void AnalysisConfig::EnableMKLDNN() { #ifdef PADDLE_WITH_MKLDNN pass_builder()->EnableMKLDNN(); use_mkldnn_ = true; @@ -142,9 +142,9 @@ void contrib::AnalysisConfig::EnableMKLDNN() { Update(); } -void contrib::AnalysisConfig::EnableTensorRtEngine( +void AnalysisConfig::EnableTensorRtEngine( int workspace_size, int max_batch_size, int min_subgraph_size, - contrib::AnalysisConfig::Precision precision_mode) { + AnalysisConfig::Precision precision_mode) { #ifdef PADDLE_WITH_CUDA if (!use_gpu()) { LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first"; @@ -165,7 +165,7 @@ void contrib::AnalysisConfig::EnableTensorRtEngine( } // TODO(Superjomn) refactor this, buggy. 
-void contrib::AnalysisConfig::Update() { +void AnalysisConfig::Update() { auto info = SerializeInfoCache(); if (info == serialized_info_cache_) return; @@ -225,7 +225,7 @@ void contrib::AnalysisConfig::Update() { } } -std::string contrib::AnalysisConfig::SerializeInfoCache() { +std::string AnalysisConfig::SerializeInfoCache() { std::stringstream ss; ss << model_dir_; ss << prog_file_; @@ -260,14 +260,14 @@ std::string contrib::AnalysisConfig::SerializeInfoCache() { return ss.str(); } -void contrib::AnalysisConfig::SetCpuMathLibraryNumThreads( +void AnalysisConfig::SetCpuMathLibraryNumThreads( int cpu_math_library_num_threads) { cpu_math_library_num_threads_ = cpu_math_library_num_threads; Update(); } -float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const { +float AnalysisConfig::fraction_of_gpu_memory_for_pool() const { #ifdef PADDLE_WITH_CUDA // Get the GPU memory details and calculate the fraction of memory for the // GPU memory pool. @@ -282,8 +282,8 @@ float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const { #endif } -void contrib::AnalysisConfig::EnableMemoryOptim( - bool static_optim, bool force_update_static_cache) { +void AnalysisConfig::EnableMemoryOptim(bool static_optim, + bool force_update_static_cache) { enable_memory_optim_ = true; static_memory_optim_ = static_optim; static_memory_optim_force_update_ = force_update_static_cache; @@ -291,14 +291,14 @@ void contrib::AnalysisConfig::EnableMemoryOptim( Update(); } -bool contrib::AnalysisConfig::enable_memory_optim() const { +bool AnalysisConfig::enable_memory_optim() const { return enable_memory_optim_; } -void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer, - size_t prog_buffer_size, - const char *param_buffer, - size_t param_buffer_size) { +void AnalysisConfig::SetModelBuffer(const char *prog_buffer, + size_t prog_buffer_size, + const char *param_buffer, + size_t param_buffer_size) { prog_file_ = std::string(prog_buffer, prog_buffer + 
prog_buffer_size); params_file_ = std::string(param_buffer, param_buffer + param_buffer_size); model_from_memory_ = true; @@ -306,7 +306,7 @@ void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer, Update(); } -NativeConfig contrib::AnalysisConfig::ToNativeConfig() const { +NativeConfig AnalysisConfig::ToNativeConfig() const { NativeConfig config; config.model_dir = model_dir_; config.prog_file = prog_file_; @@ -318,4 +318,9 @@ NativeConfig contrib::AnalysisConfig::ToNativeConfig() const { return config; } +void AnalysisConfig::SwitchIrDebug(int x) { + ir_debug_ = x; + Update(); +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 66374cb7f07b3d9b6bfbff8382a3dfa7e8f2b04f..da2e9803f0467f2b83d79cdd06d4317d41630b04 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -47,7 +47,6 @@ DECLARE_bool(profile); namespace paddle { -using contrib::AnalysisConfig; using inference::Singleton; #if PADDLE_WITH_TENSORRT using inference::tensorrt::TRTInt8Calibrator; @@ -59,7 +58,8 @@ namespace { bool IsPersistable(const framework::VarDesc *var) { if (var->Persistable() && var->GetType() != framework::proto::VarType::FEED_MINIBATCH && - var->GetType() != framework::proto::VarType::FETCH_LIST) { + var->GetType() != framework::proto::VarType::FETCH_LIST && + var->GetType() != framework::proto::VarType::RAW) { return true; } return false; @@ -731,10 +731,10 @@ std::string AnalysisPredictor::GetSeriazlizedProgram() const { } template <> -std::unique_ptr CreatePaddlePredictor( - const contrib::AnalysisConfig &config) { - return CreatePaddlePredictor(config); +std::unique_ptr CreatePaddlePredictor( + const AnalysisConfig &config) { + return CreatePaddlePredictor( + config); } } // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 
fa1d0d596df5a3619af74e0fead3a0b376186e08..014df4ee8b6d86232212736c43a9aff32ffee011 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -33,7 +33,6 @@ using inference::analysis::Argument; using inference::analysis::Analyzer; using framework::proto::ProgramDesc; using framework::NaiveExecutor; -using contrib::AnalysisConfig; /** \brief This predictor is based on the original native predictor with IR and * Analysis support. @@ -123,7 +122,7 @@ class AnalysisPredictor : public PaddlePredictor { #endif private: - contrib::AnalysisConfig config_; + AnalysisConfig config_; Argument argument_; std::unique_ptr executor_; platform::Place place_; diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index 20b61344da978a87baf654efd4ad2b3ae90454c0..002ba90e40e69d565f5a54e374a3f0083b84273f 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -24,7 +24,6 @@ DEFINE_string(dirname, "", "dirname to tests."); namespace paddle { -using contrib::AnalysisConfig; TEST(AnalysisPredictor, analysis_off) { AnalysisConfig config; @@ -197,7 +196,7 @@ TEST(AnalysisPredictor, memory_optim) { AnalysisConfig config(FLAGS_dirname); config.DisableGpu(); config.EnableMemoryOptim(true); - config.pass_builder()->TurnOnDebug(); + config.SwitchIrDebug(); auto native_predictor = CreatePaddlePredictor(config.ToNativeConfig()); diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc index 54895679ca37362c7267677af80274b8de95e296..e82cb53bf073d3d1ab9a518218edaf430728463f 100644 --- a/paddle/fluid/inference/api/api_impl_tester.cc +++ b/paddle/fluid/inference/api/api_impl_tester.cc @@ -295,7 +295,7 @@ TEST(inference_api_native, image_classification_gpu) { #endif TEST(PassBuilder, Delete) { - contrib::AnalysisConfig config; + AnalysisConfig 
config; config.DisableGpu(); config.pass_builder()->DeletePass("attention_lstm_fuse_pass"); const auto& passes = config.pass_builder()->AllPasses(); diff --git a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc index 338a0cec161f352781f132aea71dd56f68840c62..f7da55c9ae368763786c1b1fd3e86d942c5e9fe8 100644 --- a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc @@ -36,7 +36,7 @@ namespace demo { */ void Main() { std::unique_ptr predictor; - paddle::contrib::AnalysisConfig config; + paddle::AnalysisConfig config; config.EnableUseGpu(100, 0); config.SetModel(FLAGS_modeldir + "/__model__", FLAGS_modeldir + "/__params__"); diff --git a/paddle/fluid/inference/api/demo_ci/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc index 5320992b7e78f4aa0ea8950af03038c1953dd027..0d2c418c56db620c71d99b64ee79b18be427cc34 100644 --- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc @@ -34,7 +34,6 @@ DEFINE_bool(use_gpu, false, "Whether use gpu."); namespace paddle { namespace demo { -using contrib::AnalysisConfig; /* * Use the native and analysis fluid engine to inference the demo. */ diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 5b899b26d60dec3634d7016c925143e1ae26992d..47361b3279e14dd65a0e6e7f864e508ef1183045 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -29,11 +29,6 @@ namespace paddle { class AnalysisPredictor; -// == -// -// ----------------------------------------------------------------------------------- -// NOTE: The following APIs are not mature yet, we are still working on them. -namespace contrib { // NOTE WIP, not stable yet. 
struct AnalysisConfig { @@ -145,9 +140,12 @@ struct AnalysisConfig { */ bool tensorrt_engine_enabled() const { return use_tensorrt_; } - /** Control whther to debug IR graph analysis phase. + /** \brief Control whether to debug IR graph analysis phase. + * + * This will generate DOT files for visualizing the computation graph after + * each analysis pass applied. */ - void SwitchIrDebug(int x = true) { ir_debug_ = x; } + void SwitchIrDebug(int x = true); /** Turn on MKLDNN. */ @@ -260,5 +258,4 @@ struct AnalysisConfig { mutable std::unique_ptr pass_builder_; }; -} // namespace contrib } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index 406983224615fbdb649301f1ffe3fbd136938a61..8ac8bc529183edc2f8f888ca7ba14611acaadc10 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -221,7 +221,7 @@ class PaddlePredictor { virtual std::string GetSeriazlizedProgram() const { assert(false); // Force raise error. return "NotImplemented"; - }; + } /** The common configs for all the predictors. 
*/ diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 391932a1ee018c45818457c55fd8f82a22ab7405..aa353f12ca7333713e2d640cce6b2dfbea3c4e26 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -117,6 +117,7 @@ class CpuPassStrategy : public PassStrategy { "conv_bn_fuse_pass", // "conv_eltwiseadd_bn_fuse_pass", // "is_test_pass", // + "identity_scale_op_clean_pass", // }); use_gpu_ = false; } @@ -155,6 +156,7 @@ class GpuPassStrategy : public PassStrategy { GpuPassStrategy() : PassStrategy({}) { passes_.assign({ "infer_clean_graph_pass", // + "identity_scale_op_clean_pass", // "conv_affine_channel_fuse_pass", // "conv_eltwiseadd_affine_channel_fuse_pass", // "conv_bn_fuse_pass", // diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h index 919f5d55f88c3a6473f66371e2f3d91f3c4721c5..5815bc9a1464293e0a56f05e34183580eac96cea 100644 --- a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h +++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h @@ -13,16 +13,16 @@ // limitations under the License. 
#pragma once + +#include +#include #include #include -#include +#include // NOLINT #include #include #include #include - -#include -#include #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 07b9e0e051bce13f6caeca54a664019c55d80fa6..7ecd9e35332843e3a391cdad5ce32220d890abd1 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -128,6 +128,11 @@ inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz" SERIAL) +# bert, max_len=20, embedding_dim=128 +set(BERT_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/bert_emb128") +download_model_and_data(${BERT_INSTALL_DIR} "bert_emb128_model.tar.gz" "bert_data_len20.txt.tar.gz") +inference_analysis_api_test(test_analyzer_bert ${BERT_INSTALL_DIR} analyzer_bert_tester.cc SERIAL) + # anakin if (WITH_ANAKIN AND WITH_MKL) # only needed in CI # anakin rnn1 diff --git a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..f646fd6d91c81b6738e4fc5278739307fa5f99b5 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc @@ -0,0 +1,223 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { + +using paddle::PaddleTensor; + +template +void GetValueFromStream(std::stringstream *ss, T *t) { + (*ss) >> (*t); +} + +template <> +void GetValueFromStream(std::stringstream *ss, std::string *t) { + *t = ss->str(); +} + +// Split string to vector +template +void Split(const std::string &line, char sep, std::vector *v) { + std::stringstream ss; + T t; + for (auto c : line) { + if (c != sep) { + ss << c; + } else { + GetValueFromStream(&ss, &t); + v->push_back(std::move(t)); + ss.str({}); + ss.clear(); + } + } + + if (!ss.str().empty()) { + GetValueFromStream(&ss, &t); + v->push_back(std::move(t)); + ss.str({}); + ss.clear(); + } +} + +template +constexpr paddle::PaddleDType GetPaddleDType(); + +template <> +constexpr paddle::PaddleDType GetPaddleDType() { + return paddle::PaddleDType::INT64; +} + +template <> +constexpr paddle::PaddleDType GetPaddleDType() { + return paddle::PaddleDType::FLOAT32; +} + +// Parse tensor from string +template +bool ParseTensor(const std::string &field, paddle::PaddleTensor *tensor) { + std::vector data; + Split(field, ':', &data); + if (data.size() < 2) return false; + + std::string shape_str = data[0]; + + std::vector shape; + Split(shape_str, ' ', &shape); + + std::string mat_str = data[1]; + + std::vector mat; + Split(mat_str, ' ', &mat); + + tensor->shape = shape; + auto size = + std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()) * + sizeof(T); + tensor->data.Resize(size); + 
std::copy(mat.begin(), mat.end(), static_cast(tensor->data.data())); + tensor->dtype = GetPaddleDType(); + + return true; +} + +// Parse input tensors from string +bool ParseLine(const std::string &line, + std::vector *tensors) { + std::vector fields; + Split(line, ';', &fields); + + if (fields.size() < 5) return false; + + tensors->clear(); + tensors->reserve(5); + + int i = 0; + // src_id + paddle::PaddleTensor src_id; + ParseTensor(fields[i++], &src_id); + tensors->push_back(src_id); + + // pos_id + paddle::PaddleTensor pos_id; + ParseTensor(fields[i++], &pos_id); + tensors->push_back(pos_id); + + // segment_id + paddle::PaddleTensor segment_id; + ParseTensor(fields[i++], &segment_id); + tensors->push_back(segment_id); + + // self_attention_bias + paddle::PaddleTensor self_attention_bias; + ParseTensor(fields[i++], &self_attention_bias); + tensors->push_back(self_attention_bias); + + // next_segment_index + paddle::PaddleTensor next_segment_index; + ParseTensor(fields[i++], &next_segment_index); + tensors->push_back(next_segment_index); + + return true; +} + +bool LoadInputData(std::vector> *inputs) { + if (FLAGS_infer_data.empty()) { + LOG(ERROR) << "please set input data path"; + return false; + } + + std::ifstream fin(FLAGS_infer_data); + std::string line; + int sample = 0; + + // The unit-test dataset only have 10 samples, each sample have 5 feeds. 
+ while (std::getline(fin, line)) { + std::vector feed_data; + ParseLine(line, &feed_data); + inputs->push_back(std::move(feed_data)); + sample++; + if (!FLAGS_test_all_data && sample == FLAGS_batch_size) break; + } + LOG(INFO) << "number of samples: " << sample; + + return true; +} + +void SetConfig(AnalysisConfig *config) { config->SetModel(FLAGS_infer_model); } + +void profile(bool use_mkldnn = false) { + AnalysisConfig config; + SetConfig(&config); + + if (use_mkldnn) { + config.EnableMKLDNN(); + } + + std::vector outputs; + std::vector> inputs; + LoadInputData(&inputs); + TestPrediction(reinterpret_cast(&config), + inputs, &outputs, FLAGS_num_threads); +} + +TEST(Analyzer_bert, profile) { profile(); } +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_bert, profile_mkldnn) { profile(true); } +#endif + +// Check the fuse status +TEST(Analyzer_bert, fuse_statis) { + AnalysisConfig cfg; + SetConfig(&cfg); + int num_ops; + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); + LOG(INFO) << "num_ops: " << num_ops; +} + +// Compare result of NativeConfig and AnalysisConfig +void compare(bool use_mkldnn = false) { + AnalysisConfig cfg; + SetConfig(&cfg); + if (use_mkldnn) { + cfg.EnableMKLDNN(); + } + + std::vector> inputs; + LoadInputData(&inputs); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), inputs); +} + +TEST(Analyzer_bert, compare) { compare(); } +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_bert, compare_mkldnn) { compare(true /* use_mkldnn */); } +#endif + +// Compare Deterministic result +TEST(Analyzer_bert, compare_determine) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> inputs; + LoadInputData(&inputs); + CompareDeterministic(reinterpret_cast(&cfg), + inputs); +} +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc index 
e78ab942d113323fecf5510dca85fb5db734efc8..735e4fb563788438ee49ff6308d11f4dbe4962be 100644 --- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc @@ -19,7 +19,6 @@ DEFINE_int32(max_turn_num, 9, namespace paddle { namespace inference { -using contrib::AnalysisConfig; constexpr int32_t kMaxTurnLen = 50; @@ -165,7 +164,7 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, input_slots->push_back(std::move(response_mask_tensor)); } -void SetConfig(contrib::AnalysisConfig *cfg) { +void SetConfig(AnalysisConfig *cfg) { cfg->SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param"); cfg->SwitchSpecifyInputNames(); cfg->SwitchIrOptim(true); @@ -187,7 +186,7 @@ void SetInput(std::vector> *inputs) { // Easy for profiling independently. void profile(bool use_mkldnn = false) { - contrib::AnalysisConfig cfg; + AnalysisConfig cfg; SetConfig(&cfg); if (use_mkldnn) { @@ -223,7 +222,7 @@ TEST(Analyzer_dam, profile_mkldnn) { profile(true /* use_mkldnn */); } // Check the fuse status TEST(Analyzer_dam, fuse_statis) { - contrib::AnalysisConfig cfg; + AnalysisConfig cfg; SetConfig(&cfg); int num_ops; @@ -256,7 +255,7 @@ void compare(bool use_mkldnn = false) { TEST(Analyzer_dam, compare_with_static_memory_optim) { // The small dam will core in CI, but works in local. if (FLAGS_max_turn_num == 9) { - contrib::AnalysisConfig cfg, cfg1; + AnalysisConfig cfg, cfg1; DataRecord data(FLAGS_infer_data, FLAGS_batch_size); std::vector> input_slots_all; @@ -282,7 +281,7 @@ TEST(Analyzer_dam, compare_with_static_memory_optim) { TEST(Analyzer_dam, compare_with_dynamic_memory_optim) { // The small dam will core in CI, but works in local. 
if (FLAGS_max_turn_num == 9) { - contrib::AnalysisConfig cfg, cfg1; + AnalysisConfig cfg, cfg1; DataRecord data(FLAGS_infer_data, FLAGS_batch_size); std::vector> input_slots_all; diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc index b9666e01adb23e0cbd9257bc55081c3a5001e887..347672eaae314aa42096d48a3b044014f2ddbf84 100644 --- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc @@ -18,8 +18,6 @@ namespace paddle { namespace inference { namespace analysis { -using contrib::AnalysisConfig; - struct DataRecord { std::vector data; std::vector lod; diff --git a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc index 529a0174c8542f5226e70ef4a47bde069220ecc2..089f655c180d784af66af60277bdbf32a6019599 100644 --- a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc @@ -16,7 +16,6 @@ namespace paddle { namespace inference { -using contrib::AnalysisConfig; struct DataRecord { std::vector> query, title; @@ -75,7 +74,7 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, } } -void SetConfig(contrib::AnalysisConfig *cfg) { +void SetConfig(AnalysisConfig *cfg) { cfg->SetModel(FLAGS_infer_model); cfg->DisableGpu(); cfg->SwitchSpecifyInputNames(); @@ -95,7 +94,7 @@ void SetInput(std::vector> *inputs) { // Easy for profiling independently. 
void profile(bool use_mkldnn = false) { - contrib::AnalysisConfig cfg; + AnalysisConfig cfg; SetConfig(&cfg); std::vector outputs; @@ -130,7 +129,7 @@ TEST(Analyzer_MM_DNN, profile_mkldnn) { profile(true /* use_mkldnn */); } // Check the fuse status TEST(Analyzer_MM_DNN, fuse_statis) { - contrib::AnalysisConfig cfg; + AnalysisConfig cfg; SetConfig(&cfg); int num_ops; @@ -141,7 +140,7 @@ TEST(Analyzer_MM_DNN, fuse_statis) { // Compare result of NativeConfig and AnalysisConfig void compare(bool use_mkldnn = false) { - contrib::AnalysisConfig cfg; + AnalysisConfig cfg; SetConfig(&cfg); if (use_mkldnn) { diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc index 6fef79dc4608acd6eee679ad4939e7684db98f5b..a70aa7a6ac41121a0c8ea397ebc7e24e4b206d12 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc @@ -16,7 +16,6 @@ namespace paddle { namespace inference { -using contrib::AnalysisConfig; struct DataRecord { std::vector> word, mention; @@ -76,7 +75,7 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data) { } } -void SetConfig(contrib::AnalysisConfig *cfg, bool memory_load = false) { +void SetConfig(AnalysisConfig *cfg, bool memory_load = false) { if (memory_load) { std::string buffer_prog, buffer_param; ReadBinaryFile(FLAGS_infer_model + "/__model__", &buffer_prog); @@ -105,7 +104,7 @@ void SetInput(std::vector> *inputs) { // Easy for profiling independently. 
void profile(bool memory_load = false) { - contrib::AnalysisConfig cfg; + AnalysisConfig cfg; SetConfig(&cfg, memory_load); std::vector outputs; @@ -136,7 +135,7 @@ TEST(Analyzer_Chinese_ner, profile_memory_load) { // Check the fuse status TEST(Analyzer_Chinese_ner, fuse_statis) { - contrib::AnalysisConfig cfg; + AnalysisConfig cfg; SetConfig(&cfg); int num_ops; @@ -152,7 +151,7 @@ TEST(Analyzer_Chinese_ner, fuse_statis) { // Compare result of NativeConfig and AnalysisConfig TEST(Analyzer_Chinese_ner, compare) { - contrib::AnalysisConfig cfg; + AnalysisConfig cfg; SetConfig(&cfg); std::vector> input_slots_all; diff --git a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc index ad2c46e48d5a34a457a615f313f1ac3cc916b200..3f6c933f2bcc6ed5410cb95a48f5ee6869280fe4 100644 --- a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc @@ -16,7 +16,6 @@ namespace paddle { namespace inference { -using contrib::AnalysisConfig; struct DataRecord { std::vector> query_basic, query_phrase, title_basic, @@ -103,7 +102,7 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, } } -void SetConfig(contrib::AnalysisConfig *cfg) { +void SetConfig(AnalysisConfig *cfg) { cfg->SetModel(FLAGS_infer_model); cfg->DisableGpu(); cfg->SwitchSpecifyInputNames(); @@ -123,7 +122,7 @@ void SetInput(std::vector> *inputs) { // Easy for profiling independently. 
TEST(Analyzer_Pyramid_DNN, profile) { - contrib::AnalysisConfig cfg; + AnalysisConfig cfg; SetConfig(&cfg); std::vector outputs; @@ -147,7 +146,7 @@ TEST(Analyzer_Pyramid_DNN, profile) { // Check the fuse status TEST(Analyzer_Pyramid_DNN, fuse_statis) { - contrib::AnalysisConfig cfg; + AnalysisConfig cfg; SetConfig(&cfg); int num_ops; @@ -158,7 +157,7 @@ TEST(Analyzer_Pyramid_DNN, fuse_statis) { // Compare result of NativeConfig and AnalysisConfig TEST(Analyzer_Pyramid_DNN, compare) { - contrib::AnalysisConfig cfg; + AnalysisConfig cfg; SetConfig(&cfg); std::vector> input_slots_all; diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc index 22e6366fb5cba6c7a0cde9c0c5f50f56c2e23b05..c27c39f40a2067dd2bd2150e4b1e53eab7cdf06e 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc @@ -20,7 +20,6 @@ namespace paddle { namespace inference { using namespace framework; // NOLINT -using namespace contrib; // NOLINT struct DataRecord { std::vector>> link_step_data_all; @@ -223,7 +222,7 @@ void SetInput(std::vector> *inputs) { // Easy for profiling independently. TEST(Analyzer_rnn1, profile) { - contrib::AnalysisConfig cfg; + AnalysisConfig cfg; SetConfig(&cfg); cfg.DisableGpu(); cfg.SwitchIrDebug(); @@ -237,7 +236,7 @@ TEST(Analyzer_rnn1, profile) { // Check the fuse status TEST(Analyzer_rnn1, fuse_statis) { - contrib::AnalysisConfig cfg; + AnalysisConfig cfg; SetConfig(&cfg); int num_ops; @@ -254,7 +253,7 @@ TEST(Analyzer_rnn1, fuse_statis) { // Compare result of NativeConfig and AnalysisConfig TEST(Analyzer_rnn1, compare) { - contrib::AnalysisConfig cfg; + AnalysisConfig cfg; SetConfig(&cfg); std::vector> input_slots_all; @@ -276,7 +275,7 @@ TEST(Analyzer_rnn1, compare_determine) { // Test Multi-Thread. 
TEST(Analyzer_rnn1, multi_thread) { - contrib::AnalysisConfig cfg; + AnalysisConfig cfg; SetConfig(&cfg); std::vector outputs; diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc index 8be2a6d79b2ede2c149aa523e38c3960ab30acb1..dd953e0dccbb3749bfcc87966453c6976dfefa10 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc @@ -142,7 +142,7 @@ void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false) { cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params"); cfg->DisableGpu(); cfg->SwitchSpecifyInputNames(); - cfg->pass_builder()->TurnOnDebug(); + cfg->SwitchIrDebug(); cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); if (use_mkldnn) { cfg->EnableMKLDNN(); diff --git a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc index 2db297e2005c6b657259187d6b6b76657d9e4388..2003be82019333ca97b9fa8ef83668825fe5710d 100644 --- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc @@ -69,7 +69,7 @@ void SetInput(std::vector> *inputs) { TEST(Analyzer_Text_Classification, profile) { AnalysisConfig cfg; SetConfig(&cfg); - cfg.pass_builder()->TurnOnDebug(); + cfg.SwitchIrDebug(); std::vector outputs; std::vector> input_slots_all; diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index f3e75ffbb5962885bd926af50b764bec561cc454..ca04c1365cbbffcb4a2786cde9ab240cc20aa3d8 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -20,7 +20,6 @@ limitations under the License. 
*/ namespace paddle { namespace inference { namespace analysis { -using contrib::AnalysisConfig; struct Record { std::vector data; diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h index ecc10bafd650e52dfb73e8dd4329c697ff4f4ccc..b0c23fbd534847c8aad244749761e9c072148796 100644 --- a/paddle/fluid/inference/tests/api/config_printer.h +++ b/paddle/fluid/inference/tests/api/config_printer.h @@ -58,9 +58,8 @@ std::ostream &operator<<(std::ostream &os, const NativeConfig &config) { return os; } -std::ostream &operator<<(std::ostream &os, - const contrib::AnalysisConfig &config) { - os << GenSpaces(num_spaces) << "contrib::AnalysisConfig {\n"; +std::ostream &operator<<(std::ostream &os, const AnalysisConfig &config) { + os << GenSpaces(num_spaces) << "AnalysisConfig {\n"; num_spaces++; os << config.ToNativeConfig(); if (!config.model_from_memory()) { diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index b1f7a3464ac6027faffe283bccaf9793eae939e1..2811eb4946ea025cf6c7ab197c4e603df86f6f2d 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -56,16 +56,9 @@ DECLARE_int32(paddle_num_threads); namespace paddle { namespace inference { -float Random(float low, float high) { - static std::random_device rd; - static std::mt19937 mt(rd()); - std::uniform_real_distribution dist(low, high); - return dist(mt); -} - void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) { const auto *analysis_config = - reinterpret_cast(config); + reinterpret_cast(config); if (use_analysis) { LOG(INFO) << *analysis_config; return; @@ -109,9 +102,9 @@ void CompareResult(const std::vector &outputs, std::unique_ptr CreateTestPredictor( const PaddlePredictor::Config *config, bool use_analysis = true) { const auto *analysis_config = - reinterpret_cast(config); + reinterpret_cast(config); if 
(use_analysis) { - return CreatePaddlePredictor(*analysis_config); + return CreatePaddlePredictor(*analysis_config); } auto native_config = analysis_config->ToNativeConfig(); return CreatePaddlePredictor(native_config); @@ -146,7 +139,8 @@ void SetFakeImageInput(std::vector> *inputs, const std::string &dirname, bool is_combined = true, std::string model_filename = "model", std::string params_filename = "params", - const std::vector *feed_names = nullptr) { + const std::vector *feed_names = nullptr, + const int continuous_inuput_index = 0) { // Set fake_image_data PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data."); std::vector> feed_target_shapes = GetFeedTargetShapes( @@ -183,7 +177,8 @@ void SetFakeImageInput(std::vector> *inputs, float *input_data = static_cast(input.data.data()); // fill input data, for profile easily, do not use random data here. for (size_t j = 0; j < len; ++j) { - *(input_data + j) = Random(0.0, 1.0) / 10.; + *(input_data + j) = + static_cast((j + continuous_inuput_index) % len) / len; } } (*inputs).emplace_back(input_slots); diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc index db7109b7505d4fe4dcfcf88f303aa262bc5b44fb..17a433c9d98768dbda4ba93bdceb6cc1717adc07 100644 --- a/paddle/fluid/inference/tests/api/trt_models_tester.cc +++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc @@ -42,9 +42,9 @@ void SetConfig(ConfigType* config, std::string model_dir, bool use_gpu, } template <> -void SetConfig(contrib::AnalysisConfig* config, - std::string model_dir, bool use_gpu, - bool use_tensorrt, int batch_size) { +void SetConfig(AnalysisConfig* config, std::string model_dir, + bool use_gpu, bool use_tensorrt, + int batch_size) { if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) { config->SetModel(model_dir + "/" + FLAGS_prog_filename, model_dir + "/" + FLAGS_param_filename); @@ -75,11 +75,11 @@ void profile(std::string 
model_dir, bool use_analysis, bool use_tensorrt) { std::vector outputs; if (use_analysis || use_tensorrt) { - contrib::AnalysisConfig config; + AnalysisConfig config; config.EnableUseGpu(100, 0); config.pass_builder()->TurnOnDebug(); - SetConfig(&config, model_dir, true, use_tensorrt, - FLAGS_batch_size); + SetConfig(&config, model_dir, true, use_tensorrt, + FLAGS_batch_size); TestPrediction(reinterpret_cast(&config), inputs_all, &outputs, FLAGS_num_threads, true); } else { @@ -99,18 +99,18 @@ void compare(std::string model_dir, bool use_tensorrt) { SetFakeImageInput(&inputs_all, model_dir, false, "__model__", ""); } - contrib::AnalysisConfig analysis_config; - SetConfig(&analysis_config, model_dir, true, - use_tensorrt, FLAGS_batch_size); + AnalysisConfig analysis_config; + SetConfig(&analysis_config, model_dir, true, use_tensorrt, + FLAGS_batch_size); CompareNativeAndAnalysis( reinterpret_cast(&analysis_config), inputs_all); } void compare_continuous_input(std::string model_dir, bool use_tensorrt) { - contrib::AnalysisConfig analysis_config; - SetConfig(&analysis_config, model_dir, true, - use_tensorrt, FLAGS_batch_size); + AnalysisConfig analysis_config; + SetConfig(&analysis_config, model_dir, true, use_tensorrt, + FLAGS_batch_size); auto config = reinterpret_cast(&analysis_config); auto native_pred = CreateTestPredictor(config, false); @@ -119,9 +119,10 @@ void compare_continuous_input(std::string model_dir, bool use_tensorrt) { std::vector> inputs_all; if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) { SetFakeImageInput(&inputs_all, model_dir, true, FLAGS_prog_filename, - FLAGS_param_filename); + FLAGS_param_filename, nullptr, i); } else { - SetFakeImageInput(&inputs_all, model_dir, false, "__model__", ""); + SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "", nullptr, + i); } CompareNativeAndAnalysis(native_pred.get(), analysis_pred.get(), inputs_all); diff --git a/paddle/fluid/inference/utils/benchmark_tester.cc 
b/paddle/fluid/inference/utils/benchmark_tester.cc index 80763160df3adfd8c34e66bc7a5370808b349e76..0c48c2db9b691ae8cf587f2729c2789d4ce2dbe1 100644 --- a/paddle/fluid/inference/utils/benchmark_tester.cc +++ b/paddle/fluid/inference/utils/benchmark_tester.cc @@ -34,6 +34,6 @@ TEST(Benchmark, PersistToFile) { benchmark.SetLatency(220); benchmark.PersistToFile("1.log"); - benchmark.PersistToFile("1.log"); - benchmark.PersistToFile("1.log"); + benchmark.PersistToFile("2.log"); + benchmark.PersistToFile("3.log"); } diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc index 5d8684f083bda8499000c9fd0a7617cf129db13b..327adcc4aac1c50b51942c557d66dae6770e24f2 100644 --- a/paddle/fluid/memory/allocation/legacy_allocator.cc +++ b/paddle/fluid/memory/allocation/legacy_allocator.cc @@ -13,9 +13,15 @@ // limitations under the License. #include "paddle/fluid/memory/allocation/legacy_allocator.h" + #include #include #include + +#ifdef PADDLE_WITH_JEMALLOC +#include +#endif + #include "glog/logging.h" #include "paddle/fluid/memory/detail/buddy_allocator.h" #include "paddle/fluid/memory/detail/system_allocator.h" @@ -53,11 +59,6 @@ size_t memory_usage(const platform::Place &p); using BuddyAllocator = detail::BuddyAllocator; -std::unordered_map> - gpu_mem_info; - BuddyAllocator *GetCPUBuddyAllocator() { // We tried thread_local for inference::RNN1 model, but that not works much // for multi-thread test. 
@@ -95,7 +96,11 @@ struct NaiveAllocator { template <> void *Alloc(const platform::CPUPlace &place, size_t size) { VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); +#ifdef PADDLE_WITH_JEMALLOC + void *p = malloc(size); +#else void *p = GetCPUBuddyAllocator()->Alloc(size); +#endif if (FLAGS_init_allocated_mem) { memset(p, 0xEF, size); } @@ -107,12 +112,21 @@ template <> void Free(const platform::CPUPlace &place, void *p, size_t size) { VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); +#ifdef PADDLE_WITH_JEMALLOC + free(p); +#else GetCPUBuddyAllocator()->Free(p); +#endif } template <> size_t Used(const platform::CPUPlace &place) { +#ifdef PADDLE_WITH_JEMALLOC + // fake the result of used memory when PADDLE_WITH_JEMALLOC is ON + return 0U; +#else return GetCPUBuddyAllocator()->Used(); +#endif } #ifdef PADDLE_WITH_CUDA @@ -125,6 +139,8 @@ BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) { devices = platform::GetSelectedDevices(); int gpu_num = devices.size(); + allocation::GPUMemMonitor.Initialize(devices.size()); + a_arr = new BuddyAllocator *[gpu_num]; for (size_t i = 0; i < devices.size(); ++i) { int dev_id = devices[i]; @@ -171,25 +187,19 @@ void *Alloc(const platform::CUDAPlace &place, platform::SetDeviceId(place.device); size_t avail, total; platform::GpuMemoryUsage(&avail, &total); - LOG(WARNING) << "Cannot allocate " << string::HumanReadableSize(size) - << " in GPU " << place.device << ", available " - << string::HumanReadableSize(avail); - LOG(WARNING) << "total " << total; - LOG(WARNING) << "GpuMinChunkSize " - << string::HumanReadableSize( - buddy_allocator->GetMinChunkSize()); - LOG(WARNING) << "GpuMaxChunkSize " - << string::HumanReadableSize( - buddy_allocator->GetMaxChunkSize()); - LOG(WARNING) << "GPU memory used: " - << string::HumanReadableSize(Used(place)); + LOG(FATAL) << "Cannot allocate " << string::HumanReadableSize(size) + << " in GPU " << place.device << ", available " + << 
string::HumanReadableSize(avail) << "total " << total + << "GpuMinChunkSize " + << string::HumanReadableSize(buddy_allocator->GetMinChunkSize()) + << "GpuMaxChunkSize " + << string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()) + << "GPU memory used: " + << string::HumanReadableSize(Used(place)); platform::SetDeviceId(cur_dev); } else { - gpu_mem_info[place.device].first += size; - if (gpu_mem_info[place.device].first > gpu_mem_info[place.device].second) { - gpu_mem_info[place.device].second = gpu_mem_info[place.device].first; - VLOG(3) << "device: " << place.device << " peak memory usage : " - << (gpu_mem_info[place.device].second >> 20) << " MiB"; + if (VLOG_IS_ON(3)) { + allocation::GPUMemMonitor.Add(place.device, size); } if (FLAGS_init_allocated_mem) { cudaMemset(ptr, 0xEF, size); @@ -206,7 +216,9 @@ void Free(const platform::CUDAPlace &place, void *p, size_t size) { #ifdef PADDLE_WITH_CUDA GetGPUBuddyAllocator(place.device)->Free(p); - gpu_mem_info[place.device].first -= size; + if (VLOG_IS_ON(3)) { + allocation::GPUMemMonitor.Minus(place.device, size); + } #else PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); #endif @@ -316,6 +328,8 @@ size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const { namespace allocation { +LegacyMemMonitor GPUMemMonitor; + Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { void *ptr = boost::apply_visitor(legacy::AllocVisitor(size), place_); return new Allocation(ptr, size, place_); @@ -327,6 +341,63 @@ void LegacyAllocator::Free(Allocation *allocation) { allocation->place()); delete allocation; } + +bool MemInfo::Add(const size_t &size) { + std::lock_guard lock(mutex_); + usage_ += size; + bool peak_point = usage_ > peak_usage_; + if (peak_point) peak_usage_ = usage_; + return peak_point; +} + +void MemInfo::Minus(const size_t &size) { + std::lock_guard lock(mutex_); + usage_ -= size; +} + +uint64_t MemInfo::GetPeakUsage() { return peak_usage_; } + 
+LegacyMemMonitor::~LegacyMemMonitor() { + for (auto &item : gpu_mem_info_) delete item.second; +} + +void LegacyMemMonitor::Initialize(const int &device_num) { + for (auto i = 0; i < device_num; ++i) { + gpu_mem_info_[i] = new MemInfo(); + } +} + +void LegacyMemMonitor::Add(const int &device, const size_t &size) { + if (gpu_mem_info_[device]->Add(size)) { + VLOG(3) << "#LegacyMemMonitor# device: " << device + << " peak memory usage : " + << (gpu_mem_info_[device]->GetPeakUsage() >> 20) << " MiB"; + } +} + +void LegacyMemMonitor::Minus(const int &device, const size_t &size) { + gpu_mem_info_[device]->Minus(size); +} + +uint64_t LegacyMemMonitor::GetMemUsage(const int &device) { + return gpu_mem_info_.find(device) == gpu_mem_info_.end() + ? 0 + : gpu_mem_info_[device]->GetPeakUsage(); +} + +void LegacyMemMonitor::PrintMemUsage() { + std::vector devices; + for (const auto &item : gpu_mem_info_) { + devices.emplace_back(item.first); + } + std::sort(devices.begin(), devices.end()); + for (const auto &device : devices) { + std::cout << "Device : " << device << " Peak Memory Usage : " + << (gpu_mem_info_[device]->GetPeakUsage() >> 20) << " MiB" + << std::endl; + } +} + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/legacy_allocator.h b/paddle/fluid/memory/allocation/legacy_allocator.h index 503a7a685cb9d8dbbbbd6c23b5b82c383893e3d8..ccbc8c70d8e9a16e7edb1be54bf80bec3b368eca 100644 --- a/paddle/fluid/memory/allocation/legacy_allocator.h +++ b/paddle/fluid/memory/allocation/legacy_allocator.h @@ -13,12 +13,59 @@ // limitations under the License. 
#pragma once +#include +#include // NOLINT +#include +#include +#include #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/platform/place.h" namespace paddle { namespace memory { namespace allocation { +class MemInfo { + public: + MemInfo() : usage_(0), peak_usage_(0) {} + MemInfo(const MemInfo &) = delete; + MemInfo &operator=(const MemInfo &) = delete; + + // return a flag to indicate current operation will create a peak point or not + bool Add(const size_t &); + void Minus(const size_t &); + + uint64_t GetPeakUsage(); + + private: + /* current memory usage*/ + uint64_t usage_; + uint64_t peak_usage_; + std::mutex mutex_; +}; + +class LegacyMemMonitor { + public: + // used to store the GPU memory usage of each devices + using MemUsage = std::unordered_map; + + MemUsage GetMemUsageInfo() { return gpu_mem_info_; } + ~LegacyMemMonitor(); + + void Initialize(const int &); + void Add(const int &, const size_t &); + void Minus(const int &, const size_t &); + + uint64_t GetMemUsage(const int &); + + void PrintMemUsage(); + + protected: + MemUsage gpu_mem_info_; +}; + +extern LegacyMemMonitor GPUMemMonitor; + class LegacyAllocatorPrivate; class LegacyAllocator : public Allocator { public: diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 9c5b8604f40ae56c463b54c71623feb61bd8d297..189db2317d0544014d9c74e0fd5e9ead54925b9c 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/activation_op.h" #include -#include "paddle/fluid/operators/mkldnn_activation_op.h" +#include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h" #include "paddle/fluid/platform/port.h" namespace paddle { @@ -547,12 +547,14 @@ namespace ops = paddle::operators; __macro(Swish, swish); \ __macro(ThresholdedRelu, thresholded_relu); -#define REGISTER_INPLACE_ACTIVATION_OP(OP_NAME, KERNEL_TYPE) \ - REGISTER_OPERATOR(KERNEL_TYPE, ::paddle::operators::ActivationOp, \ - ::paddle::operators::OP_NAME##OpMaker, \ - ::paddle::operators::ActivationOpInferVarType, \ - ::paddle::operators::OP_NAME##GradMaker); \ - REGISTER_OPERATOR(KERNEL_TYPE##_grad, ::paddle::operators::ActivationOpGrad) +#define REGISTER_INPLACE_ACTIVATION_OP(OP_NAME, KERNEL_TYPE) \ + REGISTER_OPERATOR(KERNEL_TYPE, ::paddle::operators::ActivationOp, \ + ::paddle::operators::OP_NAME##OpMaker, \ + ::paddle::operators::ActivationOpInferVarType, \ + ::paddle::operators::OP_NAME##GradMaker, \ + ::paddle::framework::SingleOpInplaceInToOut); \ + REGISTER_OPERATOR(KERNEL_TYPE##_grad, ::paddle::operators::ActivationOpGrad, \ + ::paddle::framework::SingleOpInplaceInToOut) #define REGISTER_ACTIVATION_OP(OP_NAME, KERNEL_TYPE) \ REGISTER_OPERATOR(KERNEL_TYPE, ::paddle::operators::ActivationOp, \ diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 8b672e09b2c5c203c1a1447fbbd14a45ef7ba257..feac4125381bd897dac89943af44850012e4761d 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -589,8 +589,10 @@ class BatchNormGradMaker : public framework::SingleGradOpDescMaker { op->SetInput("SavedVariance", Output("SavedVariance")); // used when setting use_global_stats True during training - op->SetInput("Mean", Output("MeanOut")); - op->SetInput("Variance", Output("VarianceOut")); + if (boost::get(GetAttr("use_global_stats"))) { + op->SetInput("Mean", Output("MeanOut")); + op->SetInput("Variance", 
Output("VarianceOut")); + } op->SetAttrMap(Attrs()); @@ -602,13 +604,48 @@ class BatchNormGradMaker : public framework::SingleGradOpDescMaker { } }; +class BatchNormInplaceInToOut : public framework::InplaceInToOut { + public: + using InplaceInToOut::InplaceInToOut; + + protected: + std::unordered_map Apply( + const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + std::unordered_map inplace_in_to_out = { + {"Mean", "MeanOut"}, {"Variance", "VarianceOut"}, {"X", "Y"}, + }; + return inplace_in_to_out; + } +}; + +class BatchNormGradInplaceInToOut : public framework::InplaceInToOut { + public: + using InplaceInToOut::InplaceInToOut; + + protected: + std::unordered_map Apply( + const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + std::unordered_map inplace_in_to_out = { + // Scale, Bias, SavedMean, SavedVariance shape is [batch_size, C] + {framework::GradVarName("Y"), framework::GradVarName("X")}, + {"SavedMean", framework::GradVarName("Scale")}, + {"SavedVariance", framework::GradVarName("Bias")}, + }; + return inplace_in_to_out; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, - ops::BatchNormOpInferVarType, ops::BatchNormGradMaker); -REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp); + ops::BatchNormOpInferVarType, ops::BatchNormGradMaker, + ops::BatchNormInplaceInToOut); +REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp, + ops::BatchNormGradInplaceInToOut); REGISTER_OP_CPU_KERNEL( batch_norm, ops::BatchNormKernel, diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc index e78ecc1a12309fe084a4165e5bb0d8bfb1dcf957..e93cd8615e052e4dfc6255549bf7a9b84b7dd657 100644 --- a/paddle/fluid/operators/beam_search_op.cc +++ b/paddle/fluid/operators/beam_search_op.cc @@ -51,6 +51,9 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker { 
AddOutput("selected_scores", "A LoDTensor containing the accumulated scores corresponding to " "Output(selected_ids)."); + AddOutput( + "parent_idx", + "A Tensor preserving the selected_ids' parent indice in pre_ids."); // Attributes stored in AttributeMap AddAttr("level", "the level of LoDTensor"); diff --git a/paddle/fluid/operators/beam_search_op.h b/paddle/fluid/operators/beam_search_op.h index 1b939e742de06aedf187d25d002d19e0a4fafc9d..f808020cc765585d1633c6c3bf528080a7e83f07 100644 --- a/paddle/fluid/operators/beam_search_op.h +++ b/paddle/fluid/operators/beam_search_op.h @@ -41,13 +41,15 @@ class BeamSearchOpKernel : public framework::OpKernel { auto selected_ids = context.Output("selected_ids"); auto selected_scores = context.Output("selected_scores"); + auto* parent_idx = context.Output("parent_idx"); PADDLE_ENFORCE_NOT_NULL(selected_ids); PADDLE_ENFORCE_NOT_NULL(selected_scores); + PADDLE_ENFORCE_NOT_NULL(parent_idx); math::BeamSearchFunctor alg; alg(context.template device_context(), pre_ids, pre_scores, - ids, scores, selected_ids, selected_scores, level, beam_size, end_id, - is_accumulated); + ids, scores, selected_ids, selected_scores, parent_idx, level, + beam_size, end_id, is_accumulated); } }; diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index d3a61dc367c3642b8faa9085a470a302712395e5..f6fbe97565c43c306ea885c765c0a665492fa317 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -31,6 +31,8 @@ detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc polygon_box_transform_op.cu) detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc) detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc) +detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu) +detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc) if(WITH_GPU) detection_library(generate_proposals_op 
SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS memory cub) diff --git a/paddle/fluid/operators/detection/bbox_util.h b/paddle/fluid/operators/detection/bbox_util.h index b99edb5bf05f94e762b377a8882e4c3fcdb5afad..a7bc3e027229884e78721d29428a8ab3f08a6ebc 100644 --- a/paddle/fluid/operators/detection/bbox_util.h +++ b/paddle/fluid/operators/detection/bbox_util.h @@ -99,5 +99,29 @@ void BboxOverlaps(const framework::Tensor& r_boxes, } } +template +void ClipTiledBoxes(const platform::DeviceContext& ctx, + const framework::Tensor& im_info, + const framework::Tensor& input_boxes, + framework::Tensor* out) { + T* out_data = out->mutable_data(ctx.GetPlace()); + const T* im_info_data = im_info.data(); + const T* input_boxes_data = input_boxes.data(); + T zero(0); + T im_w = round(im_info_data[1] / im_info_data[2]); + T im_h = round(im_info_data[0] / im_info_data[2]); + for (int64_t i = 0; i < input_boxes.numel(); ++i) { + if (i % 4 == 0) { + out_data[i] = std::max(std::min(input_boxes_data[i], im_w - 1), zero); + } else if (i % 4 == 1) { + out_data[i] = std::max(std::min(input_boxes_data[i], im_h - 1), zero); + } else if (i % 4 == 2) { + out_data[i] = std::max(std::min(input_boxes_data[i], im_w - 1), zero); + } else { + out_data[i] = std::max(std::min(input_boxes_data[i], im_h - 1), zero); + } + } +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detection/box_clip_op.cc b/paddle/fluid/operators/detection/box_clip_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3aa766559a530bc31fbb277f2bcd474da776e63b --- /dev/null +++ b/paddle/fluid/operators/detection/box_clip_op.cc @@ -0,0 +1,86 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/box_clip_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class BoxClipOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of BoxClipOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("ImInfo"), + "Input(ImInfo) of BoxClipOp should not be null."); + + auto input_box_dims = ctx->GetInputDim("Input"); + auto im_info_dims = ctx->GetInputDim("ImInfo"); + + if (ctx->IsRuntime()) { + auto input_box_size = input_box_dims.size(); + PADDLE_ENFORCE_EQ(input_box_dims[input_box_size - 1], 4, + "The last dimension of Input must be 4"); + PADDLE_ENFORCE_EQ(im_info_dims.size(), 2, + "The rank of Input(Input) in BoxClipOp must be 2"); + PADDLE_ENFORCE_EQ(im_info_dims[1], 3, + "The last dimension of ImInfo must be 3"); + } + ctx->ShareDim("Input", /*->*/ "Output"); + ctx->ShareLoD("Input", /*->*/ "Output"); + } +}; + +class BoxClipOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Input", + "(LoDTensor) " + "Input is a LoDTensor with shape [..., 4] holds 4 points" + "in last dimension in format [xmin, ymin, xmax, ymax]"); + AddInput("ImInfo", + "(Tensor) Information for image reshape is in shape (N, 3), " + "in format (height, width, im_scale)"); + AddOutput("Output", + "(LoDTensor) " + "Output is a LoDTensor with the same shape as Input" 
+ "and it is the result after clip"); + AddComment(R"DOC( +This operator clips input boxes to original input images. + +For each input box, The formula is given as follows: + + $$xmin = \max(\min(xmin, im_w - 1), 0)$$ + $$ymin = \max(\min(ymin, im_h - 1), 0)$$ + $$xmax = \max(\min(xmax, im_w - 1), 0)$$ + $$ymax = \max(\min(ymax, im_h - 1), 0)$$ + +where im_w and im_h are computed from ImInfo, the formula is given as follows: + + $$im_w = \round(width / im_scale)$$ + $$im_h = \round(height / im_scale)$$ +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(box_clip, ops::BoxClipOp, ops::BoxClipOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL( + box_clip, ops::BoxClipKernel, + ops::BoxClipKernel); diff --git a/paddle/fluid/operators/detection/box_clip_op.cu b/paddle/fluid/operators/detection/box_clip_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..b727da5f7b736b6f22407d1dfbca708ed0cf04d9 --- /dev/null +++ b/paddle/fluid/operators/detection/box_clip_op.cu @@ -0,0 +1,74 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detection/box_clip_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTenso = framework::LoDTensor; + +static constexpr int ImInfoSize = 3; + +template +static __global__ void GPUBoxClip(const T *input, const size_t *lod, + const size_t width, const T *im_info, + T *output) { + T im_w = round(im_info[blockIdx.x * ImInfoSize + 1] / + im_info[blockIdx.x * ImInfoSize + 2]); + T im_h = round(im_info[blockIdx.x * ImInfoSize] / + im_info[blockIdx.x * ImInfoSize + 2]); + for (int i = threadIdx.x; i < (lod[blockIdx.x + 1] - lod[blockIdx.x]) * width; + i += BlockSize) { + int idx = lod[blockIdx.x] * width + i; + T im_size = (idx % 2 == 0) ? im_w : im_h; + output[idx] = max(min(input[idx], im_size - 1), T(0.)); + } +} + +template +class GPUBoxClipKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()), + "This kernel only runs on GPU device."); + auto *input = context.Input("Input"); + auto *im_info = context.Input("ImInfo"); + auto *output = context.Output("Output"); + const int64_t num = input->dims()[0]; + const int64_t bbox_width = input->numel() / num; + auto lod = input->lod(); + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + auto &dev_ctx = context.template device_context(); + auto stream = dev_ctx.stream(); + const size_t batch_size = lod.back().size() - 1; + T *output_data = output->mutable_data(dev_ctx.GetPlace()); + GPUBoxClip<<>>( + input->data(), abs_offset_lod[0].CUDAMutableData(dev_ctx.GetPlace()), + bbox_width, im_info->data(), output_data); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = 
paddle::operators; +REGISTER_OP_CUDA_KERNEL( + box_clip, ops::GPUBoxClipKernel, + ops::GPUBoxClipKernel); diff --git a/paddle/fluid/operators/detection/box_clip_op.h b/paddle/fluid/operators/detection/box_clip_op.h new file mode 100644 index 0000000000000000000000000000000000000000..74e1f88f8d8b28e490d170934760bd9bffc807bc --- /dev/null +++ b/paddle/fluid/operators/detection/box_clip_op.h @@ -0,0 +1,50 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detection/bbox_util.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class BoxClipKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input_box = context.Input("Input"); + auto* im_info = context.Input("ImInfo"); + auto* output_box = context.Output("Output"); + auto& dev_ctx = + context.template device_context(); + output_box->mutable_data(context.GetPlace()); + if (input_box->lod().size()) { + PADDLE_ENFORCE_EQ(input_box->lod().size(), 1UL, + "Only support 1 level of LoD."); + } + auto box_lod = input_box->lod().back(); + int64_t n = static_cast(box_lod.size() - 1); + for (int i = 0; i < n; ++i) { + Tensor im_info_slice = im_info->Slice(i, i + 1); + Tensor box_slice = input_box->Slice(box_lod[i], box_lod[i + 1]); + Tensor output_slice = output_box->Slice(box_lod[i], box_lod[i + 1]); + ClipTiledBoxes(dev_ctx, im_info_slice, box_slice, &output_slice); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/box_coder_op.cc b/paddle/fluid/operators/detection/box_coder_op.cc index 06fbb9815c52ea69e3aa9e893512e039853b9514..0a51d50e06176e713922837861f2102c9ee8a899 100644 --- a/paddle/fluid/operators/detection/box_coder_op.cc +++ b/paddle/fluid/operators/detection/box_coder_op.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/detection/box_coder_op.h" +#include namespace paddle { namespace operators { @@ -32,32 +33,49 @@ class BoxCoderOp : public framework::OperatorWithKernel { if (ctx->IsRuntime()) { PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2, - "The rank of Input of PriorBoxVar must be 2"); + "The rank of Input PriorBox must be 2"); PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [N, 4]"); if (ctx->HasInput("PriorBoxVar")) { auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar"); - PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims); + PADDLE_ENFORCE(prior_box_var_dims.size() == 2, + "Input(PriorBoxVar) of BoxCoderOp should be 2."); + PADDLE_ENFORCE_EQ( + prior_box_dims, prior_box_var_dims, + "The dimension of Input(PriorBoxVar) should be equal to" + "the dimension of Input(PriorBox) when the rank is 2."); } + } - auto code_type = - GetBoxCodeType(ctx->Attrs().Get("code_type")); - if (code_type == BoxCodeType::kEncodeCenterSize) { - PADDLE_ENFORCE_EQ(target_box_dims.size(), 2, - "The rank of Input of TargetBox must be 2"); - PADDLE_ENFORCE_EQ(target_box_dims[1], 4, - "The shape of TargetBox is [M, 4]"); - } else if (code_type == BoxCodeType::kDecodeCenterSize) { - PADDLE_ENFORCE_EQ(target_box_dims.size(), 3, - "The rank of Input of TargetBox must be 3"); + auto code_type = GetBoxCodeType(ctx->Attrs().Get("code_type")); + int axis = ctx->Attrs().Get("axis"); + if (code_type == BoxCodeType::kEncodeCenterSize) { + PADDLE_ENFORCE_EQ(target_box_dims.size(), 2, + "The rank of Input TargetBox must be 2"); + PADDLE_ENFORCE_EQ(target_box_dims[1], 4, + "The shape of TargetBox is [M, 4]"); + ctx->SetOutputDim( + "OutputBox", + framework::make_ddim({target_box_dims[0], prior_box_dims[0], 4})); + } else if (code_type == BoxCodeType::kDecodeCenterSize) { + PADDLE_ENFORCE_EQ(target_box_dims.size(), 3, + "The rank of Input TargetBox must be 3"); + if (axis == 0) { PADDLE_ENFORCE_EQ(target_box_dims[1], prior_box_dims[0]); - 
PADDLE_ENFORCE_EQ(target_box_dims[2], prior_box_dims[1]); + } else if (axis == 1) { + PADDLE_ENFORCE_EQ(target_box_dims[0], prior_box_dims[0]); + } else { + PADDLE_THROW("axis must be 0 or 1."); } + PADDLE_ENFORCE_EQ(target_box_dims[2], prior_box_dims[1]); + ctx->ShareDim("TargetBox", /*->*/ "OutputBox"); + } + + if (code_type == BoxCodeType::kDecodeCenterSize && axis == 1) { + ctx->ShareLoD("PriorBox", /*->*/ "OutputBox"); + } else { + ctx->ShareLoD("TargetBox", /*->*/ "OutputBox"); } - ctx->SetOutputDim( - "OutputBox", - framework::make_ddim({target_box_dims[0], prior_box_dims[0], 4})); - ctx->ShareLoD("TargetBox", /*->*/ "OutputBox"); } }; @@ -100,6 +118,21 @@ class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker { "(bool, default true) " "whether treat the priorbox as a noramlized box") .SetDefault(true); + AddAttr("axis", + "(int, default 0)" + "which axis in PriorBox to broadcast for box decode," + "for example, if axis is 0 and TargetBox has shape" + "[N, M, 4] and PriorBox has shape [M, 4], then PriorBox " + "will broadcast to [N, M, 4] for decoding. It is only valid" + "when code type is decode_center_size") + .SetDefault(0) + .InEnum({0, 1}); + AddAttr>( + "variance", + "(vector, default {})," + "variance of prior box with shape [4]. PriorBoxVar and variance can" + "not be provided at the same time.") + .SetDefault(std::vector{}); AddOutput("OutputBox", "(LoDTensor or Tensor) " "When code_type is 'encode_center_size', the output tensor of " @@ -138,7 +171,11 @@ where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, width and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote the priorbox's (anchor) center coordinates, width and height. `pxv`, `pyv`, `pwv`, `phv` denote the variance of the priorbox and `ox`, `oy`, `ow`, `oh` denote the -encoded/decoded coordinates, width and height. +encoded/decoded coordinates, width and height. + +During Box Decoding, two modes for broadcast are supported. 
Say target box has +shape [N, M, 4], and the shape of prior box can be [N, 4] or [M, 4]. Then prior +box will broadcast to target box along the assigned axis. )DOC"); } }; diff --git a/paddle/fluid/operators/detection/box_coder_op.cu b/paddle/fluid/operators/detection/box_coder_op.cu index a7af111f63d654319dd1d90d2032956951dfe49e..19a5bb90fa828899ad6270c051090dd3662aeed8 100644 --- a/paddle/fluid/operators/detection/box_coder_op.cu +++ b/paddle/fluid/operators/detection/box_coder_op.cu @@ -9,6 +9,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include +#include +#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/detection/box_coder_op.h" #include "paddle/fluid/platform/cuda_primitives.h" @@ -16,11 +19,11 @@ namespace paddle { namespace operators { template -__global__ void EncodeCenterSizeKernel(const T* prior_box_data, - const T* prior_box_var_data, - const T* target_box_data, const int row, - const int col, const int len, - const bool normalized, T* output) { +__global__ void EncodeCenterSizeKernel( + const T* prior_box_data, const T* prior_box_var_data, + const T* target_box_data, const int row, const int col, const int len, + const bool normalized, const T prior_box_var_size, const float* variance, + const int var_size, T* output) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < row * col) { const int row_idx = idx / col; @@ -30,11 +33,9 @@ __global__ void EncodeCenterSizeKernel(const T* prior_box_data, T prior_box_height = prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1] + (normalized == false); - T prior_box_center_x = - (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2; - T prior_box_center_y = (prior_box_data[col_idx * len + 3] + - prior_box_data[col_idx * len + 1]) / - 2; + T prior_box_center_x = prior_box_data[col_idx * len] + prior_box_width 
/ 2; + T prior_box_center_y = + prior_box_data[col_idx * len + 1] + prior_box_height / 2; T target_box_center_x = (target_box_data[row_idx * len + 2] + target_box_data[row_idx * len]) / @@ -55,58 +56,67 @@ __global__ void EncodeCenterSizeKernel(const T* prior_box_data, output[idx * len + 2] = log(fabs(target_box_width / prior_box_width)); output[idx * len + 3] = log(fabs(target_box_height / prior_box_height)); if (prior_box_var_data) { - output[idx * len] /= prior_box_var_data[col_idx * len]; - output[idx * len + 1] /= prior_box_var_data[col_idx * len + 1]; - output[idx * len + 2] /= prior_box_var_data[col_idx * len + 2]; - output[idx * len + 3] /= prior_box_var_data[col_idx * len + 3]; + int prior_var_offset = col_idx * len; + output[idx * len] /= prior_box_var_data[prior_var_offset]; + output[idx * len + 1] /= prior_box_var_data[prior_var_offset + 1]; + output[idx * len + 2] /= prior_box_var_data[prior_var_offset + 2]; + output[idx * len + 3] /= prior_box_var_data[prior_var_offset + 3]; + } else if (var_size == 4) { + for (int k = 0; k < 4; ++k) { + output[idx * len + k] /= static_cast(variance[k]); + } } } } template -__global__ void DecodeCenterSizeKernel(const T* prior_box_data, - const T* prior_box_var_data, - const T* target_box_data, const int row, - const int col, const int len, - const bool normalized, T* output) { +__global__ void DecodeCenterSizeKernel( + const T* prior_box_data, const T* prior_box_var_data, + const T* target_box_data, const int row, const int col, const int len, + const bool normalized, const T prior_box_var_size, const float* variance, + const int var_size, const int axis, T* output) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; + int prior_box_offset = 0; if (idx < row * col) { const int col_idx = idx % col; - T prior_box_width = prior_box_data[col_idx * len + 2] - - prior_box_data[col_idx * len] + (normalized == false); - T prior_box_height = prior_box_data[col_idx * len + 3] - - prior_box_data[col_idx * len + 1] + + 
const int row_idx = idx / col; + prior_box_offset = axis == 0 ? col_idx * len : row_idx * len; + T prior_box_width = prior_box_data[prior_box_offset + 2] - + prior_box_data[prior_box_offset] + + (normalized == false); + T prior_box_height = prior_box_data[prior_box_offset + 3] - + prior_box_data[prior_box_offset + 1] + (normalized == false); T prior_box_center_x = - (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2; - T prior_box_center_y = (prior_box_data[col_idx * len + 3] + - prior_box_data[col_idx * len + 1]) / - 2; + prior_box_data[prior_box_offset] + prior_box_width / 2; + T prior_box_center_y = + prior_box_data[prior_box_offset + 1] + prior_box_height / 2; T target_box_width, target_box_height; T target_box_center_x, target_box_center_y; + T box_var_x = T(1), box_var_y = T(1); + T box_var_w = T(1), box_var_h = T(1); if (prior_box_var_data) { - target_box_width = exp(prior_box_var_data[col_idx * len + 2] * - target_box_data[idx * len + 2]) * - prior_box_width; - target_box_height = exp(prior_box_var_data[col_idx * len + 3] * - target_box_data[idx * len + 3]) * - prior_box_height; - target_box_center_x = prior_box_var_data[col_idx * len] * - target_box_data[idx * len] * prior_box_width + - prior_box_center_x; - target_box_center_y = prior_box_var_data[col_idx * len + 1] * - target_box_data[idx * len + 1] * - prior_box_height + - prior_box_center_y; - } else { - target_box_width = exp(target_box_data[idx * len + 2]) * prior_box_width; - target_box_height = - exp(target_box_data[idx * len + 3]) * prior_box_height; - target_box_center_x = - target_box_data[idx * len] * prior_box_width + prior_box_center_x; - target_box_center_y = target_box_data[idx * len + 1] * prior_box_height + - prior_box_center_y; + int prior_var_offset = axis == 0 ? 
col_idx * len : row_idx * len; + box_var_x = prior_box_var_data[prior_var_offset]; + box_var_y = prior_box_var_data[prior_var_offset + 1]; + box_var_w = prior_box_var_data[prior_var_offset + 2]; + box_var_h = prior_box_var_data[prior_var_offset + 3]; + } else if (var_size == 4) { + box_var_x = static_cast(variance[0]); + box_var_y = static_cast(variance[1]); + box_var_w = static_cast(variance[2]); + box_var_h = static_cast(variance[3]); } + target_box_width = + exp(box_var_w * target_box_data[idx * len + 2]) * prior_box_width; + target_box_height = + exp(box_var_h * target_box_data[idx * len + 3]) * prior_box_height; + target_box_center_x = + box_var_x * target_box_data[idx * len] * prior_box_width + + prior_box_center_x; + target_box_center_y = + box_var_y * target_box_data[idx * len + 1] * prior_box_height + + prior_box_center_y; output[idx * len] = target_box_center_x - target_box_width / 2; output[idx * len + 1] = target_box_center_y - target_box_height / 2; @@ -127,36 +137,64 @@ class BoxCoderCUDAKernel : public framework::OpKernel { auto* prior_box_var = context.Input("PriorBoxVar"); auto* target_box = context.Input("TargetBox"); auto* output_box = context.Output("OutputBox"); - + std::vector variance = context.Attr>("variance"); const T* prior_box_data = prior_box->data(); const T* target_box_data = target_box->data(); const T* prior_box_var_data = nullptr; - if (prior_box_var) prior_box_var_data = prior_box_var->data(); + auto prior_box_var_size = 0; + if (prior_box_var) { + PADDLE_ENFORCE(variance.empty(), + "Input 'PriorBoxVar' and attribute 'variance' should not" + "be used at the same time."); + prior_box_var_data = prior_box_var->data(); + prior_box_var_size = prior_box_var->dims().size(); + } + if (!(variance.empty())) { + PADDLE_ENFORCE(static_cast(variance.size()) == 4, + "Size of attribute 'variance' should be 4"); + } if (target_box->lod().size()) { PADDLE_ENFORCE_EQ(target_box->lod().size(), 1, "Only support 1 level of LoD."); } + const int 
var_size = static_cast(variance.size()); + + auto code_type = GetBoxCodeType(context.Attr("code_type")); + bool normalized = context.Attr("box_normalized"); + int axis = context.Attr("axis"); + auto row = target_box->dims()[0]; auto col = prior_box->dims()[0]; + if (code_type == BoxCodeType::kDecodeCenterSize) { + col = target_box->dims()[1]; + } auto len = prior_box->dims()[1]; int block = 512; int grid = (row * col + block - 1) / block; auto& device_ctx = context.cuda_device_context(); + auto& allocator = + platform::DeviceTemporaryAllocator::Instance().Get(device_ctx); + int bytes = var_size * sizeof(float); + auto dev_var = allocator.Allocate(bytes); + float* dev_var_data = reinterpret_cast(dev_var->ptr()); + auto cplace = platform::CPUPlace(); + const auto gplace = boost::get(context.GetPlace()); + memory::Copy(gplace, dev_var_data, cplace, &variance[0], bytes, + device_ctx.stream()); + output_box->mutable_data({row, col, len}, context.GetPlace()); T* output = output_box->data(); - auto code_type = GetBoxCodeType(context.Attr("code_type")); - bool normalized = context.Attr("box_normalized"); if (code_type == BoxCodeType::kEncodeCenterSize) { EncodeCenterSizeKernel<<>>( prior_box_data, prior_box_var_data, target_box_data, row, col, len, - normalized, output); + normalized, prior_box_var_size, dev_var_data, var_size, output); } else if (code_type == BoxCodeType::kDecodeCenterSize) { DecodeCenterSizeKernel<<>>( prior_box_data, prior_box_var_data, target_box_data, row, col, len, - normalized, output); + normalized, prior_box_var_size, dev_var_data, var_size, axis, output); } } }; diff --git a/paddle/fluid/operators/detection/box_coder_op.h b/paddle/fluid/operators/detection/box_coder_op.h index b2a2bcdce932032a761a1fc064fe622f7629f9bf..6d406f8196f9964c85bb94541fa7a7a23857539b 100644 --- a/paddle/fluid/operators/detection/box_coder_op.h +++ b/paddle/fluid/operators/detection/box_coder_op.h @@ -11,6 +11,7 @@ limitations under the License. 
*/ #pragma once #include +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" @@ -34,7 +35,8 @@ class BoxCoderKernel : public framework::OpKernel { void EncodeCenterSize(const framework::Tensor* target_box, const framework::Tensor* prior_box, const framework::Tensor* prior_box_var, - const bool normalized, T* output) const { + const bool normalized, + const std::vector variance, T* output) const { int64_t row = target_box->dims()[0]; int64_t col = prior_box->dims()[0]; int64_t len = prior_box->dims()[1]; @@ -53,10 +55,9 @@ class BoxCoderKernel : public framework::OpKernel { T prior_box_height = prior_box_data[j * len + 3] - prior_box_data[j * len + 1] + (normalized == false); - T prior_box_center_x = - (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2; + T prior_box_center_x = prior_box_data[j * len] + prior_box_width / 2; T prior_box_center_y = - (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2; + prior_box_data[j * len + 1] + prior_box_height / 2; T target_box_center_x = (target_box_data[i * len + 2] + target_box_data[i * len]) / 2; @@ -78,69 +79,78 @@ class BoxCoderKernel : public framework::OpKernel { output[offset + 3] = std::log(std::fabs(target_box_height / prior_box_height)); if (prior_box_var) { - output[offset] /= prior_box_var_data[j * len]; - output[offset + 1] /= prior_box_var_data[j * len + 1]; - output[offset + 2] /= prior_box_var_data[j * len + 2]; - output[offset + 3] /= prior_box_var_data[j * len + 3]; + int prior_var_offset = j * len; + output[offset] /= prior_box_var_data[prior_var_offset]; + output[offset + 1] /= prior_box_var_data[prior_var_offset + 1]; + output[offset + 2] /= prior_box_var_data[prior_var_offset + 2]; + output[offset + 3] /= prior_box_var_data[prior_var_offset + 3]; + } else if (!(variance.empty())) { + for (int k = 0; k < 4; ++k) { + output[offset + k] /= static_cast(variance[k]); + } } } } } + template void DecodeCenterSize(const 
framework::Tensor* target_box, const framework::Tensor* prior_box, const framework::Tensor* prior_box_var, - const bool normalized, T* output) const { + const bool normalized, std::vector variance, + T* output) const { int64_t row = target_box->dims()[0]; - int64_t col = prior_box->dims()[0]; - int64_t len = prior_box->dims()[1]; + int64_t col = target_box->dims()[1]; + int64_t len = target_box->dims()[2]; auto* target_box_data = target_box->data(); auto* prior_box_data = prior_box->data(); const T* prior_box_var_data = nullptr; - if (prior_box_var) prior_box_var_data = prior_box_var->data(); - + if (var_size == 2) prior_box_var_data = prior_box_var->data(); + int prior_box_offset = 0; + T var_data[4] = {1., 1., 1., 1.}; + T* var_ptr = var_data; #ifdef PADDLE_WITH_MKLML #pragma omp parallel for collapse(2) #endif for (int64_t i = 0; i < row; ++i) { for (int64_t j = 0; j < col; ++j) { size_t offset = i * col * len + j * len; - T prior_box_width = prior_box_data[j * len + 2] - - prior_box_data[j * len] + (normalized == false); - T prior_box_height = prior_box_data[j * len + 3] - - prior_box_data[j * len + 1] + + prior_box_offset = axis == 0 ? 
j * len : i * len; + T prior_box_width = prior_box_data[prior_box_offset + 2] - + prior_box_data[prior_box_offset] + + (normalized == false); + T prior_box_height = prior_box_data[prior_box_offset + 3] - + prior_box_data[prior_box_offset + 1] + (normalized == false); T prior_box_center_x = - (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2; + prior_box_data[prior_box_offset] + prior_box_width / 2; T prior_box_center_y = - (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2; + prior_box_data[prior_box_offset + 1] + prior_box_height / 2; T target_box_center_x = 0, target_box_center_y = 0; T target_box_width = 0, target_box_height = 0; - if (prior_box_var) { - target_box_center_x = prior_box_var_data[j * len] * - target_box_data[offset] * prior_box_width + - prior_box_center_x; - target_box_center_y = prior_box_var_data[j * len + 1] * - target_box_data[offset + 1] * - prior_box_height + - prior_box_center_y; - target_box_width = std::exp(prior_box_var_data[j * len + 2] * - target_box_data[offset + 2]) * - prior_box_width; - target_box_height = std::exp(prior_box_var_data[j * len + 3] * - target_box_data[offset + 3]) * - prior_box_height; - } else { - target_box_center_x = - target_box_data[offset] * prior_box_width + prior_box_center_x; - target_box_center_y = target_box_data[offset + 1] * prior_box_height + - prior_box_center_y; - target_box_width = - std::exp(target_box_data[offset + 2]) * prior_box_width; - target_box_height = - std::exp(target_box_data[offset + 3]) * prior_box_height; + int prior_var_offset = axis == 0 ? 
j * len : i * len; + if (var_size == 2) { + std::memcpy(var_ptr, prior_box_var_data + prior_var_offset, + 4 * sizeof(T)); + } else if (var_size == 1) { + var_ptr = reinterpret_cast(variance.data()); } + T box_var_x = *var_ptr; + T box_var_y = *(var_ptr + 1); + T box_var_w = *(var_ptr + 2); + T box_var_h = *(var_ptr + 3); + + target_box_center_x = + box_var_x * target_box_data[offset] * prior_box_width + + prior_box_center_x; + target_box_center_y = + box_var_y * target_box_data[offset + 1] * prior_box_height + + prior_box_center_y; + target_box_width = + std::exp(box_var_w * target_box_data[offset + 2]) * prior_box_width; + target_box_height = std::exp(box_var_h * target_box_data[offset + 3]) * + prior_box_height; output[offset] = target_box_center_x - target_box_width / 2; output[offset + 1] = target_box_center_y - target_box_height / 2; @@ -157,26 +167,63 @@ class BoxCoderKernel : public framework::OpKernel { auto* prior_box_var = context.Input("PriorBoxVar"); auto* target_box = context.Input("TargetBox"); auto* output_box = context.Output("OutputBox"); - + std::vector variance = context.Attr>("variance"); + const int axis = context.Attr("axis"); if (target_box->lod().size()) { PADDLE_ENFORCE_EQ(target_box->lod().size(), 1UL, "Only support 1 level of LoD."); } + if (prior_box_var) { + PADDLE_ENFORCE(variance.empty(), + "Input 'PriorBoxVar' and attribute 'variance' should not" + "be used at the same time."); + } + if (!(variance.empty())) { + PADDLE_ENFORCE(static_cast(variance.size()) == 4, + "Size of attribute 'variance' should be 4"); + } + auto code_type = GetBoxCodeType(context.Attr("code_type")); + bool normalized = context.Attr("box_normalized"); + auto row = target_box->dims()[0]; auto col = prior_box->dims()[0]; + if (code_type == BoxCodeType::kDecodeCenterSize) { + col = target_box->dims()[1]; + } auto len = prior_box->dims()[1]; output_box->mutable_data({row, col, len}, context.GetPlace()); - auto code_type = GetBoxCodeType(context.Attr("code_type")); - 
bool normalized = context.Attr("box_normalized"); T* output = output_box->data(); if (code_type == BoxCodeType::kEncodeCenterSize) { EncodeCenterSize(target_box, prior_box, prior_box_var, normalized, - output); + variance, output); } else if (code_type == BoxCodeType::kDecodeCenterSize) { - DecodeCenterSize(target_box, prior_box, prior_box_var, normalized, - output); + if (prior_box_var) { + if (axis == 0) { + DecodeCenterSize<0, 2>(target_box, prior_box, prior_box_var, + normalized, variance, output); + } else { + DecodeCenterSize<1, 2>(target_box, prior_box, prior_box_var, + normalized, variance, output); + } + } else if (!(variance.empty())) { + if (axis == 0) { + DecodeCenterSize<0, 1>(target_box, prior_box, prior_box_var, + normalized, variance, output); + } else { + DecodeCenterSize<1, 1>(target_box, prior_box, prior_box_var, + normalized, variance, output); + } + } else { + if (axis == 0) { + DecodeCenterSize<0, 0>(target_box, prior_box, prior_box_var, + normalized, variance, output); + } else { + DecodeCenterSize<1, 0>(target_box, prior_box, prior_box_var, + normalized, variance, output); + } + } } } }; diff --git a/paddle/fluid/operators/detection/density_prior_box_op.h b/paddle/fluid/operators/detection/density_prior_box_op.h index ed2f5df80cf4d7a5a44af9b09f3b048b1b14cdb9..3591681fc3f6951dfc8d73e8edce38180b771eaf 100644 --- a/paddle/fluid/operators/detection/density_prior_box_op.h +++ b/paddle/fluid/operators/detection/density_prior_box_op.h @@ -52,6 +52,10 @@ class DensityPriorBoxOpKernel : public framework::OpKernel { step_height = step_h; } int num_priors = 0; + +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for reduction(+ : num_priors) +#endif for (size_t i = 0; i < densities.size(); ++i) { num_priors += (fixed_ratios.size()) * (pow(densities[i], 2)); } @@ -64,6 +68,17 @@ class DensityPriorBoxOpKernel : public framework::OpKernel { auto e_boxes = framework::EigenTensor::From(*boxes).setConstant(0.0); int step_average = static_cast((step_width + 
step_height) * 0.5); + std::vector sqrt_fixed_ratios; +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int i = 0; i < fixed_ratios.size(); i++) { + sqrt_fixed_ratios.push_back(sqrt(fixed_ratios[i])); + } + +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for collapse(2) +#endif for (int h = 0; h < feature_height; ++h) { for (int w = 0; w < feature_width; ++w) { T center_x = (w + offset) * step_width; @@ -73,34 +88,25 @@ class DensityPriorBoxOpKernel : public framework::OpKernel { for (size_t s = 0; s < fixed_sizes.size(); ++s) { auto fixed_size = fixed_sizes[s]; int density = densities[s]; + int shift = step_average / density; // Generate density prior boxes with fixed ratios. for (size_t r = 0; r < fixed_ratios.size(); ++r) { - float ar = fixed_ratios[r]; - int shift = step_average / density; - float box_width_ratio = fixed_size * sqrt(ar); - float box_height_ratio = fixed_size / sqrt(ar); + float box_width_ratio = fixed_size * sqrt_fixed_ratios[r]; + float box_height_ratio = fixed_size / sqrt_fixed_ratios[r]; + float density_center_x = center_x - step_average / 2. + shift / 2.; + float density_center_y = center_y - step_average / 2. + shift / 2.; for (int di = 0; di < density; ++di) { for (int dj = 0; dj < density; ++dj) { - float center_x_temp = - center_x - step_average / 2. + shift / 2. + dj * shift; - float center_y_temp = - center_y - step_average / 2. + shift / 2. + di * shift; - e_boxes(h, w, idx, 0) = - (center_x_temp - box_width_ratio / 2.) / img_width >= 0 - ? (center_x_temp - box_width_ratio / 2.) / img_width - : 0; - e_boxes(h, w, idx, 1) = - (center_y_temp - box_height_ratio / 2.) / img_height >= 0 - ? (center_y_temp - box_height_ratio / 2.) / img_height - : 0; - e_boxes(h, w, idx, 2) = - (center_x_temp + box_width_ratio / 2.) / img_width <= 1 - ? (center_x_temp + box_width_ratio / 2.) / img_width - : 1; - e_boxes(h, w, idx, 3) = - (center_y_temp + box_height_ratio / 2.) / img_height <= 1 - ? (center_y_temp + box_height_ratio / 2.) 
/ img_height - : 1; + float center_x_temp = density_center_x + dj * shift; + float center_y_temp = density_center_y + di * shift; + e_boxes(h, w, idx, 0) = std::max( + (center_x_temp - box_width_ratio / 2.) / img_width, 0.); + e_boxes(h, w, idx, 1) = std::max( + (center_y_temp - box_height_ratio / 2.) / img_height, 0.); + e_boxes(h, w, idx, 2) = std::min( + (center_x_temp + box_width_ratio / 2.) / img_width, 1.); + e_boxes(h, w, idx, 3) = std::min( + (center_y_temp + box_height_ratio / 2.) / img_height, 1.); idx++; } } @@ -131,8 +137,14 @@ class DensityPriorBoxOpKernel : public framework::OpKernel { vars->Resize({box_num, static_cast(variances.size())}); auto e_vars = framework::EigenMatrix::From(*vars); - - e_vars = var_et.broadcast(Eigen::DSizes(box_num, 1)); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for collapse(2) +#endif + for (int i = 0; i < box_num; ++i) { + for (int j = 0; j < variances.size(); ++j) { + e_vars(i, j) = variances[j]; + } + } vars->Resize(var_dim); boxes->Resize(box_dim); diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc index 2395b181485429784e0f3dff6d056b84268ef245..f357e3ccf905309e6656f3fa87fbee45dc357c1e 100644 --- a/paddle/fluid/operators/detection/multiclass_nms_op.cc +++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc @@ -9,9 +9,9 @@ http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - limitations under the License. 
*/ +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detection/poly_util.h" @@ -35,30 +35,45 @@ class MultiClassNMSOp : public framework::OperatorWithKernel { auto box_dims = ctx->GetInputDim("BBoxes"); auto score_dims = ctx->GetInputDim("Scores"); + auto score_size = score_dims.size(); if (ctx->IsRuntime()) { + PADDLE_ENFORCE(score_size == 2 || score_size == 3, + "The rank of Input(Scores) must be 2 or 3"); PADDLE_ENFORCE_EQ(box_dims.size(), 3, - "The rank of Input(BBoxes) must be 3."); - PADDLE_ENFORCE_EQ(score_dims.size(), 3, - "The rank of Input(Scores) must be 3."); - PADDLE_ENFORCE(box_dims[2] == 4 || box_dims[2] == 8 || - box_dims[2] == 16 || box_dims[2] == 24 || - box_dims[2] == 32, - "The 2nd dimension of Input(BBoxes) must be 4 or 8, " - "represents the layout of coordinate " - "[xmin, ymin, xmax, ymax] or " - "4 points: [x1, y1, x2, y2, x3, y3, x4, y4] or " - "8 points: [xi, yi] i= 1,2,...,8 or " - "12 points: [xi, yi] i= 1,2,...,12 or " - "16 points: [xi, yi] i= 1,2,...,16"); - PADDLE_ENFORCE_EQ(box_dims[1], score_dims[2], - "The 1st dimensiong of Input(BBoxes) must be equal to " - "3rd dimension of Input(Scores), which represents the " - "predicted bboxes."); + "The rank of Input(BBoxes) must be 3"); + if (score_size == 3) { + PADDLE_ENFORCE(box_dims[2] == 4 || box_dims[2] == 8 || + box_dims[2] == 16 || box_dims[2] == 24 || + box_dims[2] == 32, + "The last dimension of Input(BBoxes) must be 4 or 8, " + "represents the layout of coordinate " + "[xmin, ymin, xmax, ymax] or " + "4 points: [x1, y1, x2, y2, x3, y3, x4, y4] or " + "8 points: [xi, yi] i= 1,2,...,8 or " + "12 points: [xi, yi] i= 1,2,...,12 or " + "16 points: [xi, yi] i= 1,2,...,16"); + PADDLE_ENFORCE_EQ( + box_dims[1], score_dims[2], + "The 2nd dimension of Input(BBoxes) must be equal to " + "last dimension of Input(Scores), which represents the " + "predicted bboxes."); + } else { + PADDLE_ENFORCE(box_dims[2] == 4, + "The last dimension of 
Input(BBoxes) must be 4"); + PADDLE_ENFORCE_EQ(box_dims[1], score_dims[1], + "The 2nd dimension of Input(BBoxes)" + "must be equal to the 2nd dimension" + " of Input(Scores)"); + } } // Here the box_dims[0] is not the real dimension of output. // It will be rewritten in the computing kernel. - ctx->SetOutputDim("Out", {box_dims[1], box_dims[2] + 2}); + if (score_size == 3) { + ctx->SetOutputDim("Out", {box_dims[1], box_dims[2] + 2}); + } else { + ctx->SetOutputDim("Out", {-1, box_dims[2] + 2}); + } } protected: @@ -123,8 +138,9 @@ static inline T JaccardOverlap(const T* box1, const T* box2, const T inter_ymin = std::max(box1[1], box2[1]); const T inter_xmax = std::min(box1[2], box2[2]); const T inter_ymax = std::min(box1[3], box2[3]); - const T inter_w = inter_xmax - inter_xmin; - const T inter_h = inter_ymax - inter_ymin; + T norm = normalized ? static_cast(0.) : static_cast(1.); + T inter_w = inter_xmax - inter_xmin + norm; + T inter_h = inter_ymax - inter_ymin + norm; const T inter_area = inter_w * inter_h; const T bbox1_area = BBoxArea(box1, normalized); const T bbox2_area = BBoxArea(box2, normalized); @@ -139,7 +155,7 @@ T PolyIoU(const T* box1, const T* box2, const size_t box_size, T bbox2_area = PolyArea(box2, box_size, normalized); T inter_area = PolyOverlapArea(box1, box2, box_size, normalized); if (bbox1_area == 0 || bbox2_area == 0 || inter_area == 0) { - // If coordinate values are is invalid + // If coordinate values are invalid // if area size <= 0, return 0. 
return T(0.); } else { @@ -147,12 +163,35 @@ T PolyIoU(const T* box1, const T* box2, const size_t box_size, } } +template +void SliceOneClass(const platform::DeviceContext& ctx, + const framework::Tensor& items, const int class_id, + framework::Tensor* one_class_item) { + T* item_data = one_class_item->mutable_data(ctx.GetPlace()); + const T* items_data = items.data(); + const int64_t num_item = items.dims()[0]; + const int class_num = items.dims()[1]; + if (items.dims().size() == 3) { + int item_size = items.dims()[2]; + for (int i = 0; i < num_item; ++i) { + std::memcpy(item_data + i * item_size, + items_data + i * class_num * item_size + class_id * item_size, + sizeof(T) * item_size); + } + } else { + for (int i = 0; i < num_item; ++i) { + item_data[i] = items_data[i * class_num + class_id]; + } + } +} + template class MultiClassNMSKernel : public framework::OpKernel { public: void NMSFast(const Tensor& bbox, const Tensor& scores, const T score_threshold, const T nms_threshold, const T eta, - const int64_t top_k, std::vector* selected_indices) const { + const int64_t top_k, std::vector* selected_indices, + const bool normalized) const { // The total boxes for each instance. 
int64_t num_boxes = bbox.dims()[0]; // 4: [xmin ymin xmax ymax] @@ -178,15 +217,16 @@ class MultiClassNMSKernel : public framework::OpKernel { T overlap = T(0.); // 4: [xmin ymin xmax ymax] if (box_size == 4) { - overlap = JaccardOverlap(bbox_data + idx * box_size, - bbox_data + kept_idx * box_size, true); + overlap = + JaccardOverlap(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, normalized); } // 8: [x1 y1 x2 y2 x3 y3 x4 y4] or 16, 24, 32 if (box_size == 8 || box_size == 16 || box_size == 24 || box_size == 32) { - overlap = - PolyIoU(bbox_data + idx * box_size, - bbox_data + kept_idx * box_size, box_size, true); + overlap = PolyIoU(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, box_size, + normalized); } keep = overlap <= adaptive_threshold; } else { @@ -205,37 +245,58 @@ class MultiClassNMSKernel : public framework::OpKernel { void MultiClassNMS(const framework::ExecutionContext& ctx, const Tensor& scores, const Tensor& bboxes, + const int scores_size, std::map>* indices, int* num_nmsed_out) const { int64_t background_label = ctx.Attr("background_label"); int64_t nms_top_k = ctx.Attr("nms_top_k"); int64_t keep_top_k = ctx.Attr("keep_top_k"); + bool normalized = ctx.Attr("normalized"); T nms_threshold = static_cast(ctx.Attr("nms_threshold")); T nms_eta = static_cast(ctx.Attr("nms_eta")); T score_threshold = static_cast(ctx.Attr("score_threshold")); + auto& dev_ctx = ctx.template device_context(); - int64_t class_num = scores.dims()[0]; - int64_t predict_dim = scores.dims()[1]; int num_det = 0; + + int64_t class_num = scores_size == 3 ? 
scores.dims()[0] : scores.dims()[1]; + Tensor bbox_slice, score_slice; for (int64_t c = 0; c < class_num; ++c) { if (c == background_label) continue; - Tensor score = scores.Slice(c, c + 1); - NMSFast(bboxes, score, score_threshold, nms_threshold, nms_eta, nms_top_k, - &((*indices)[c])); + if (scores_size == 3) { + score_slice = scores.Slice(c, c + 1); + bbox_slice = bboxes; + } else { + score_slice.Resize({scores.dims()[0], 1}); + bbox_slice.Resize({scores.dims()[0], 4}); + SliceOneClass(dev_ctx, scores, c, &score_slice); + SliceOneClass(dev_ctx, bboxes, c, &bbox_slice); + } + NMSFast(bbox_slice, score_slice, score_threshold, nms_threshold, nms_eta, + nms_top_k, &((*indices)[c]), normalized); + if (scores_size == 2) { + std::stable_sort((*indices)[c].begin(), (*indices)[c].end()); + } num_det += (*indices)[c].size(); } *num_nmsed_out = num_det; const T* scores_data = scores.data(); if (keep_top_k > -1 && num_det > keep_top_k) { + const T* sdata; std::vector>> score_index_pairs; for (const auto& it : *indices) { int label = it.first; - const T* sdata = scores_data + label * predict_dim; + if (scores_size == 3) { + sdata = scores_data + label * scores.dims()[1]; + } else { + score_slice.Resize({scores.dims()[0], 1}); + SliceOneClass(dev_ctx, scores, label, &score_slice); + sdata = score_slice.data(); + } const std::vector& label_indices = it.second; for (size_t j = 0; j < label_indices.size(); ++j) { int idx = label_indices[j]; - PADDLE_ENFORCE_LT(idx, predict_dim); score_index_pairs.push_back( std::make_pair(sdata[idx], std::make_pair(label, idx))); } @@ -252,31 +313,55 @@ class MultiClassNMSKernel : public framework::OpKernel { int idx = score_index_pairs[j].second.second; new_indices[label].push_back(idx); } + if (scores_size == 2) { + for (const auto& it : new_indices) { + int label = it.first; + std::stable_sort(new_indices[label].begin(), + new_indices[label].end()); + } + } new_indices.swap(*indices); *num_nmsed_out = keep_top_k; } } - void 
MultiClassOutput(const Tensor& scores, const Tensor& bboxes, + void MultiClassOutput(const platform::DeviceContext& ctx, + const Tensor& scores, const Tensor& bboxes, const std::map>& selected_indices, - Tensor* outs) const { + const int scores_size, Tensor* outs) const { + int64_t class_num = scores.dims()[1]; int64_t predict_dim = scores.dims()[1]; int64_t box_size = bboxes.dims()[1]; - int64_t out_dim = bboxes.dims()[1] + 2; + if (scores_size == 2) { + box_size = bboxes.dims()[2]; + } + int64_t out_dim = box_size + 2; auto* scores_data = scores.data(); auto* bboxes_data = bboxes.data(); auto* odata = outs->data(); - + const T* sdata; + Tensor bbox; + bbox.Resize({scores.dims()[0], box_size}); int count = 0; for (const auto& it : selected_indices) { int label = it.first; - const T* sdata = scores_data + label * predict_dim; const std::vector& indices = it.second; + if (scores_size == 2) { + SliceOneClass(ctx, bboxes, label, &bbox); + } else { + sdata = scores_data + label * predict_dim; + } for (size_t j = 0; j < indices.size(); ++j) { int idx = indices[j]; - const T* bdata = bboxes_data + idx * box_size; - odata[count * out_dim] = label; // label - odata[count * out_dim + 1] = sdata[idx]; // score + odata[count * out_dim] = label; // label + const T* bdata; + if (scores_size == 3) { + bdata = bboxes_data + idx * box_size; + odata[count * out_dim + 1] = sdata[idx]; // score + } else { + bdata = bbox.data() + idx * box_size; + odata[count * out_dim + 1] = *(scores_data + idx * class_num + label); + } // xmin, ymin, xmax, ymax or multi-points coordinates std::memcpy(odata + count * out_dim + 2, bdata, box_size * sizeof(T)); count++; @@ -285,52 +370,64 @@ class MultiClassNMSKernel : public framework::OpKernel { } void Compute(const framework::ExecutionContext& ctx) const override { - auto* boxes = ctx.Input("BBoxes"); - auto* scores = ctx.Input("Scores"); + auto* boxes = ctx.Input("BBoxes"); + auto* scores = ctx.Input("Scores"); auto* outs = ctx.Output("Out"); auto 
score_dims = scores->dims(); - - int64_t batch_size = score_dims[0]; - int64_t class_num = score_dims[1]; - int64_t predict_dim = score_dims[2]; - int64_t box_dim = boxes->dims()[2]; - int64_t out_dim = boxes->dims()[2] + 2; + auto score_size = score_dims.size(); + auto& dev_ctx = ctx.template device_context(); std::vector>> all_indices; std::vector batch_starts = {0}; - for (int64_t i = 0; i < batch_size; ++i) { - Tensor ins_score = scores->Slice(i, i + 1); - ins_score.Resize({class_num, predict_dim}); - - Tensor ins_boxes = boxes->Slice(i, i + 1); - ins_boxes.Resize({predict_dim, box_dim}); - + int64_t batch_size = score_dims[0]; + int64_t box_dim = boxes->dims()[2]; + int64_t out_dim = box_dim + 2; + int num_nmsed_out = 0; + Tensor boxes_slice, scores_slice; + int n = score_size == 3 ? batch_size : boxes->lod().back().size() - 1; + for (int i = 0; i < n; ++i) { + if (score_size == 3) { + scores_slice = scores->Slice(i, i + 1); + scores_slice.Resize({score_dims[1], score_dims[2]}); + boxes_slice = boxes->Slice(i, i + 1); + boxes_slice.Resize({score_dims[2], box_dim}); + } else { + auto boxes_lod = boxes->lod().back(); + scores_slice = scores->Slice(boxes_lod[i], boxes_lod[i + 1]); + boxes_slice = boxes->Slice(boxes_lod[i], boxes_lod[i + 1]); + } std::map> indices; - int num_nmsed_out = 0; - MultiClassNMS(ctx, ins_score, ins_boxes, &indices, &num_nmsed_out); + MultiClassNMS(ctx, scores_slice, boxes_slice, score_size, &indices, + &num_nmsed_out); all_indices.push_back(indices); batch_starts.push_back(batch_starts.back() + num_nmsed_out); } int num_kept = batch_starts.back(); if (num_kept == 0) { - T* od = outs->mutable_data({1}, ctx.GetPlace()); + T* od = outs->mutable_data({1, 1}, ctx.GetPlace()); od[0] = -1; + batch_starts = {0, 1}; } else { outs->mutable_data({num_kept, out_dim}, ctx.GetPlace()); - for (int64_t i = 0; i < batch_size; ++i) { - Tensor ins_score = scores->Slice(i, i + 1); - ins_score.Resize({class_num, predict_dim}); - - Tensor ins_boxes = 
boxes->Slice(i, i + 1); - ins_boxes.Resize({predict_dim, box_dim}); - + for (int i = 0; i < n; ++i) { + if (score_size == 3) { + scores_slice = scores->Slice(i, i + 1); + boxes_slice = boxes->Slice(i, i + 1); + scores_slice.Resize({score_dims[1], score_dims[2]}); + boxes_slice.Resize({score_dims[2], box_dim}); + } else { + auto boxes_lod = boxes->lod().back(); + scores_slice = scores->Slice(boxes_lod[i], boxes_lod[i + 1]); + boxes_slice = boxes->Slice(boxes_lod[i], boxes_lod[i + 1]); + } int64_t s = batch_starts[i]; int64_t e = batch_starts[i + 1]; if (e > s) { Tensor out = outs->Slice(s, e); - MultiClassOutput(ins_score, ins_boxes, all_indices[i], &out); + MultiClassOutput(dev_ctx, scores_slice, boxes_slice, all_indices[i], + score_dims.size(), &out); } } } @@ -346,17 +443,24 @@ class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("BBoxes", - "(Tensor) A 3-D Tensor with shape " + "Two types of bboxes are supported:" + "1. (Tensor) A 3-D Tensor with shape " "[N, M, 4 or 8 16 24 32] represents the " "predicted locations of M bounding bboxes, N is the batch size. " "Each bounding box has four coordinate values and the layout is " - "[xmin, ymin, xmax, ymax], when box size equals to 4."); + "[xmin, ymin, xmax, ymax], when box size equals to 4." + "2. (LoDTensor) A 3-D Tensor with shape [M, C, 4]" + "M is the number of bounding boxes, C is the class number"); AddInput("Scores", - "(Tensor) A 3-D Tensor with shape [N, C, M] represents the " + "Two types of scores are supported:" + "1. (Tensor) A 3-D Tensor with shape [N, C, M] represents the " "predicted confidence predictions. N is the batch size, C is the " "class number, M is number of bounding boxes. For each category " "there are total M scores which corresponding M bounding boxes. " - " Please note, M is equal to the 1st dimension of BBoxes. "); + " Please note, M is equal to the 2nd dimension of BBoxes. " + "2. 
(LoDTensor) A 2-D LoDTensor with shape [M, C]. " + "M is the number of bbox, C is the class number. In this case, " + "Input BBoxes should be the second case with shape [M, C, 4]."); AddAttr( "background_label", "(int, defalut: 0) " @@ -384,6 +488,10 @@ class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker { "(int64_t) " "Number of total bboxes to be kept per image after NMS " "step. -1 means keeping all bboxes after NMS step."); + AddAttr("normalized", + "(bool, default true) " + "Whether detections are normalized.") + .SetDefault(true); AddOutput("Out", "(LoDTensor) A 2-D LoDTensor with shape [No, 6] represents the " "detections. Each row has 6 values: " @@ -399,24 +507,21 @@ class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( This operator is to do multi-class non maximum suppression (NMS) on a batched of boxes and scores. - In the NMS step, this operator greedily selects a subset of detection bounding boxes that have high scores larger than score_threshold, if providing this threshold, then selects the largest nms_top_k confidences scores if nms_top_k is larger than -1. Then this operator pruns away boxes that have high IOU (intersection over union) overlap with already selected boxes by adaptive threshold NMS based on parameters of nms_threshold and nms_eta. - Aftern NMS step, at most keep_top_k number of total bboxes are to be kept per image if keep_top_k is larger than -1. - This operator support multi-class and batched inputs. It applying NMS independently for each class. The outputs is a 2-D LoDTenosr, for each image, the offsets in first dimension of LoDTensor are called LoD, the number of offset is N + 1, where N is the batch size. If LoD[i + 1] - LoD[i] == 0, means there is no detected bbox for this image. If there is no detected boxes -for all images, all the elements in LoD are 0, and the Out only contains one -value which is -1. 
+for all images, all the elements in LoD are set to {1}, and the Out only +contains one value which is -1. )DOC"); } }; diff --git a/paddle/fluid/operators/yolov3_loss_op.cc b/paddle/fluid/operators/detection/yolov3_loss_op.cc similarity index 69% rename from paddle/fluid/operators/yolov3_loss_op.cc rename to paddle/fluid/operators/detection/yolov3_loss_op.cc index 60508f7ab871910c38f1e4aa04c2035075d37df5..2a69ad4b53c26f5e2e0547e75e0d9c6518a8bcba 100644 --- a/paddle/fluid/operators/yolov3_loss_op.cc +++ b/paddle/fluid/operators/detection/yolov3_loss_op.cc @@ -9,7 +9,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/yolov3_loss_op.h" +#include "paddle/fluid/operators/detection/yolov3_loss_op.h" #include "paddle/fluid/framework/op_registry.h" namespace paddle { @@ -29,23 +29,33 @@ class Yolov3LossOp : public framework::OperatorWithKernel { "Input(GTLabel) of Yolov3LossOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Loss"), "Output(Loss) of Yolov3LossOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("ObjectnessMask"), + "Output(ObjectnessMask) of Yolov3LossOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("GTMatchMask"), + "Output(GTMatchMask) of Yolov3LossOp should not be null."); auto dim_x = ctx->GetInputDim("X"); auto dim_gtbox = ctx->GetInputDim("GTBox"); auto dim_gtlabel = ctx->GetInputDim("GTLabel"); auto anchors = ctx->Attrs().Get>("anchors"); + int anchor_num = anchors.size() / 2; + auto anchor_mask = ctx->Attrs().Get>("anchor_mask"); + int mask_num = anchor_mask.size(); auto class_num = ctx->Attrs().Get("class_num"); + PADDLE_ENFORCE_EQ(dim_x.size(), 4, "Input(X) should be a 4-D tensor."); PADDLE_ENFORCE_EQ(dim_x[2], dim_x[3], "Input(X) dim[3] and dim[4] should be euqal."); - PADDLE_ENFORCE_EQ(dim_x[1], anchors.size() / 2 * (5 + class_num), - "Input(X) dim[1] should be equal to (anchor_number * (5 " - "+ class_num))."); + PADDLE_ENFORCE_EQ( 
+ dim_x[1], mask_num * (5 + class_num), + "Input(X) dim[1] should be equal to (anchor_mask_number * (5 " + "+ class_num))."); PADDLE_ENFORCE_EQ(dim_gtbox.size(), 3, "Input(GTBox) should be a 3-D tensor"); PADDLE_ENFORCE_EQ(dim_gtbox[2], 4, "Input(GTBox) dim[2] should be 5"); PADDLE_ENFORCE_EQ(dim_gtlabel.size(), 2, - "Input(GTBox) should be a 2-D tensor"); + "Input(GTLabel) should be a 2-D tensor"); PADDLE_ENFORCE_EQ(dim_gtlabel[0], dim_gtbox[0], "Input(GTBox) and Input(GTLabel) dim[0] should be same"); PADDLE_ENFORCE_EQ(dim_gtlabel[1], dim_gtbox[1], @@ -54,11 +64,22 @@ class Yolov3LossOp : public framework::OperatorWithKernel { "Attr(anchors) length should be greater then 0."); PADDLE_ENFORCE_EQ(anchors.size() % 2, 0, "Attr(anchors) length should be even integer."); + for (size_t i = 0; i < anchor_mask.size(); i++) { + PADDLE_ENFORCE_LT( + anchor_mask[i], anchor_num, + "Attr(anchor_mask) should not crossover Attr(anchors)."); + } PADDLE_ENFORCE_GT(class_num, 0, "Attr(class_num) should be an integer greater then 0."); - std::vector dim_out({1}); + std::vector dim_out({dim_x[0]}); ctx->SetOutputDim("Loss", framework::make_ddim(dim_out)); + + std::vector dim_obj_mask({dim_x[0], mask_num, dim_x[2], dim_x[3]}); + ctx->SetOutputDim("ObjectnessMask", framework::make_ddim(dim_obj_mask)); + + std::vector dim_gt_match_mask({dim_gtbox[0], dim_gtbox[1]}); + ctx->SetOutputDim("GTMatchMask", framework::make_ddim(dim_gt_match_mask)); } protected: @@ -73,11 +94,11 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", - "The input tensor of YOLO v3 loss operator, " + "The input tensor of YOLOv3 loss operator, " "This is a 4-D tensor with shape of [N, C, H, W]." 
"H and W should be same, and the second dimention(C) stores" "box locations, confidence score and classification one-hot" - "key of each anchor box"); + "keys of each anchor box"); AddInput("GTBox", "The input tensor of ground truth boxes, " "This is a 3-D tensor with shape of [N, max_box_num, 5], " @@ -89,32 +110,39 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("GTLabel", "The input tensor of ground truth label, " "This is a 2-D tensor with shape of [N, max_box_num], " - "and each element shoudl be an integer to indicate the " + "and each element should be an integer to indicate the " "box class id."); AddOutput("Loss", "The output yolov3 loss tensor, " - "This is a 1-D tensor with shape of [1]"); + "This is a 1-D tensor with shape of [N]"); + AddOutput("ObjectnessMask", + "This is an intermediate tensor with shape of [N, M, H, W], " + "M is the number of anchor masks. This parameter caches the " + "mask for calculate objectness loss in gradient kernel.") + .AsIntermediate(); + AddOutput("GTMatchMask", + "This is an intermediate tensor with shape of [N, B], " + "B is the max box number of GT boxes. 
This parameter caches " + "matched mask index of each GT boxes for gradient calculate.") + .AsIntermediate(); AddAttr("class_num", "The number of classes to predict."); AddAttr>("anchors", "The anchor width and height, " - "it will be parsed pair by pair."); + "it will be parsed pair by pair.") + .SetDefault(std::vector{}); + AddAttr>("anchor_mask", + "The mask index of anchors used in " + "current YOLOv3 loss calculation.") + .SetDefault(std::vector{}); + AddAttr("downsample_ratio", + "The downsample ratio from network input to YOLOv3 loss " + "input, so 32, 16, 8 should be set for the first, second, " + "and thrid YOLOv3 loss operators.") + .SetDefault(32); AddAttr("ignore_thresh", - "The ignore threshold to ignore confidence loss."); - AddAttr("loss_weight_xy", "The weight of x, y location loss.") - .SetDefault(1.0); - AddAttr("loss_weight_wh", "The weight of w, h location loss.") - .SetDefault(1.0); - AddAttr( - "loss_weight_conf_target", - "The weight of confidence score loss in locations with target object.") - .SetDefault(1.0); - AddAttr("loss_weight_conf_notarget", - "The weight of confidence score loss in locations without " - "target object.") - .SetDefault(1.0); - AddAttr("loss_weight_class", "The weight of classification loss.") - .SetDefault(1.0); + "The ignore threshold to ignore confidence loss.") + .SetDefault(0.7); AddComment(R"DOC( This operator generate yolov3 loss by given predict result and ground truth boxes. @@ -147,17 +175,28 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { thresh, the confidence score loss of this anchor box will be ignored. Therefore, the yolov3 loss consist of three major parts, box location loss, - confidence score loss, and classification loss. The MSE loss is used for - box location, and binary cross entropy loss is used for confidence score - loss and classification loss. + confidence score loss, and classification loss. 
The L2 loss is used for + box coordinates (w, h), and sigmoid cross entropy loss is used for box + coordinates (x, y), confidence score loss and classification loss. + + Each groud truth box find a best matching anchor box in all anchors, + prediction of this anchor box will incur all three parts of losses, and + prediction of anchor boxes with no GT box matched will only incur objectness + loss. + + In order to trade off box coordinate losses between big boxes and small + boxes, box coordinate losses will be mutiplied by scale weight, which is + calculated as follow. + + $$ + weight_{box} = 2.0 - t_w * t_h + $$ Final loss will be represented as follow. $$ - loss = \loss_weight_{xy} * loss_{xy} + \loss_weight_{wh} * loss_{wh} - + \loss_weight_{conf_target} * loss_{conf_target} - + \loss_weight_{conf_notarget} * loss_{conf_notarget} - + \loss_weight_{class} * loss_{class} + loss = (loss_{xy} + loss_{wh}) * weight_{box} + + loss_{conf} + loss_{class} $$ )DOC"); } @@ -196,6 +235,8 @@ class Yolov3LossGradMaker : public framework::SingleGradOpDescMaker { op->SetInput("GTBox", Input("GTBox")); op->SetInput("GTLabel", Input("GTLabel")); op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss")); + op->SetInput("ObjectnessMask", Output("ObjectnessMask")); + op->SetInput("GTMatchMask", Output("GTMatchMask")); op->SetAttrMap(Attrs()); diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.h b/paddle/fluid/operators/detection/yolov3_loss_op.h new file mode 100644 index 0000000000000000000000000000000000000000..8407d4e6e8f87a2e8d073c4fbda5691abe1bba68 --- /dev/null +++ b/paddle/fluid/operators/detection/yolov3_loss_op.h @@ -0,0 +1,447 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenTensor = framework::EigenTensor; +template +using EigenVector = framework::EigenVector; + +template +static inline bool LessEqualZero(T x) { + return x < 1e-6; +} + +template +static T SigmoidCrossEntropy(T x, T label) { + return (x > 0 ? x : 0.0) - x * label + std::log(1.0 + std::exp(-std::abs(x))); +} + +template +static T L2Loss(T x, T y) { + return 0.5 * (y - x) * (y - x); +} + +template +static T SigmoidCrossEntropyGrad(T x, T label) { + return 1.0 / (1.0 + std::exp(-x)) - label; +} + +template +static T L2LossGrad(T x, T y) { + return x - y; +} + +static int GetMaskIndex(std::vector mask, int val) { + for (size_t i = 0; i < mask.size(); i++) { + if (mask[i] == val) { + return i; + } + } + return -1; +} + +template +struct Box { + T x, y, w, h; +}; + +template +static inline T sigmoid(T x) { + return 1.0 / (1.0 + std::exp(-x)); +} + +template +static inline Box GetYoloBox(const T* x, std::vector anchors, int i, + int j, int an_idx, int grid_size, + int input_size, int index, int stride) { + Box b; + b.x = (i + sigmoid(x[index])) / grid_size; + b.y = (j + sigmoid(x[index + stride])) / grid_size; + b.w = std::exp(x[index + 2 * stride]) * anchors[2 * an_idx] / input_size; + b.h = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1] / input_size; + return b; +} + +template +static inline Box GetGtBox(const T* gt, int 
batch, int max_boxes, int idx) { + Box b; + b.x = gt[(batch * max_boxes + idx) * 4]; + b.y = gt[(batch * max_boxes + idx) * 4 + 1]; + b.w = gt[(batch * max_boxes + idx) * 4 + 2]; + b.h = gt[(batch * max_boxes + idx) * 4 + 3]; + return b; +} + +template +static inline T BoxOverlap(T c1, T w1, T c2, T w2) { + T l1 = c1 - w1 / 2.0; + T l2 = c2 - w2 / 2.0; + T left = l1 > l2 ? l1 : l2; + T r1 = c1 + w1 / 2.0; + T r2 = c2 + w2 / 2.0; + T right = r1 < r2 ? r1 : r2; + return right - left; +} + +template +static inline T CalcBoxIoU(Box b1, Box b2) { + T w = BoxOverlap(b1.x, b1.w, b2.x, b2.w); + T h = BoxOverlap(b1.y, b1.h, b2.y, b2.h); + T inter_area = (w < 0 || h < 0) ? 0.0 : w * h; + T union_area = b1.w * b1.h + b2.w * b2.h - inter_area; + return inter_area / union_area; +} + +static inline int GetEntryIndex(int batch, int an_idx, int hw_idx, int an_num, + int an_stride, int stride, int entry) { + return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; +} + +template +static void CalcBoxLocationLoss(T* loss, const T* input, Box gt, + std::vector anchors, int an_idx, + int box_idx, int gi, int gj, int grid_size, + int input_size, int stride) { + T tx = gt.x * grid_size - gi; + T ty = gt.y * grid_size - gj; + T tw = std::log(gt.w * input_size / anchors[2 * an_idx]); + T th = std::log(gt.h * input_size / anchors[2 * an_idx + 1]); + + T scale = (2.0 - gt.w * gt.h); + loss[0] += SigmoidCrossEntropy(input[box_idx], tx) * scale; + loss[0] += SigmoidCrossEntropy(input[box_idx + stride], ty) * scale; + loss[0] += L2Loss(input[box_idx + 2 * stride], tw) * scale; + loss[0] += L2Loss(input[box_idx + 3 * stride], th) * scale; +} + +template +static void CalcBoxLocationLossGrad(T* input_grad, const T loss, const T* input, + Box gt, std::vector anchors, + int an_idx, int box_idx, int gi, int gj, + int grid_size, int input_size, int stride) { + T tx = gt.x * grid_size - gi; + T ty = gt.y * grid_size - gj; + T tw = std::log(gt.w * input_size / anchors[2 * an_idx]); + T th 
= std::log(gt.h * input_size / anchors[2 * an_idx + 1]); + + T scale = (2.0 - gt.w * gt.h); + input_grad[box_idx] = + SigmoidCrossEntropyGrad(input[box_idx], tx) * scale * loss; + input_grad[box_idx + stride] = + SigmoidCrossEntropyGrad(input[box_idx + stride], ty) * scale * loss; + input_grad[box_idx + 2 * stride] = + L2LossGrad(input[box_idx + 2 * stride], tw) * scale * loss; + input_grad[box_idx + 3 * stride] = + L2LossGrad(input[box_idx + 3 * stride], th) * scale * loss; +} + +template +static inline void CalcLabelLoss(T* loss, const T* input, const int index, + const int label, const int class_num, + const int stride) { + for (int i = 0; i < class_num; i++) { + T pred = input[index + i * stride]; + loss[0] += SigmoidCrossEntropy(pred, (i == label) ? 1.0 : 0.0); + } +} + +template +static inline void CalcLabelLossGrad(T* input_grad, const T loss, + const T* input, const int index, + const int label, const int class_num, + const int stride) { + for (int i = 0; i < class_num; i++) { + T pred = input[index + i * stride]; + input_grad[index + i * stride] = + SigmoidCrossEntropyGrad(pred, (i == label) ? 
1.0 : 0.0) * loss; + } +} + +template +static inline void CalcObjnessLoss(T* loss, const T* input, const T* objness, + const int n, const int an_num, const int h, + const int w, const int stride, + const int an_stride) { + for (int i = 0; i < n; i++) { + for (int j = 0; j < an_num; j++) { + for (int k = 0; k < h; k++) { + for (int l = 0; l < w; l++) { + T obj = objness[k * w + l]; + if (obj > 1e-5) { + // positive sample: obj = 1 + loss[i] += SigmoidCrossEntropy(input[k * w + l], 1.0); + } else if (obj > -0.5) { + // negetive sample: obj = 0 + loss[i] += SigmoidCrossEntropy(input[k * w + l], 0.0); + } + } + } + objness += stride; + input += an_stride; + } + } +} + +template +static inline void CalcObjnessLossGrad(T* input_grad, const T* loss, + const T* input, const T* objness, + const int n, const int an_num, + const int h, const int w, + const int stride, const int an_stride) { + for (int i = 0; i < n; i++) { + for (int j = 0; j < an_num; j++) { + for (int k = 0; k < h; k++) { + for (int l = 0; l < w; l++) { + T obj = objness[k * w + l]; + if (obj > 1e-5) { + input_grad[k * w + l] = + SigmoidCrossEntropyGrad(input[k * w + l], 1.0) * loss[i]; + } else if (obj > -0.5) { + input_grad[k * w + l] = + SigmoidCrossEntropyGrad(input[k * w + l], 0.0) * loss[i]; + } + } + } + objness += stride; + input += an_stride; + input_grad += an_stride; + } + } +} + +template +static void inline GtValid(bool* valid, const T* gtbox, const int n, + const int b) { + for (int i = 0; i < n; i++) { + for (int j = 0; j < b; j++) { + if (LessEqualZero(gtbox[j * 4 + 2]) || LessEqualZero(gtbox[j * 4 + 3])) { + valid[j] = false; + } else { + valid[j] = true; + } + } + valid += b; + gtbox += b * 4; + } +} + +template +class Yolov3LossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* gt_box = ctx.Input("GTBox"); + auto* gt_label = ctx.Input("GTLabel"); + auto* loss = ctx.Output("Loss"); + 
auto* objness_mask = ctx.Output("ObjectnessMask"); + auto* gt_match_mask = ctx.Output("GTMatchMask"); + auto anchors = ctx.Attr>("anchors"); + auto anchor_mask = ctx.Attr>("anchor_mask"); + int class_num = ctx.Attr("class_num"); + float ignore_thresh = ctx.Attr("ignore_thresh"); + int downsample_ratio = ctx.Attr("downsample_ratio"); + + const int n = input->dims()[0]; + const int h = input->dims()[2]; + const int w = input->dims()[3]; + const int an_num = anchors.size() / 2; + const int mask_num = anchor_mask.size(); + const int b = gt_box->dims()[1]; + int input_size = downsample_ratio * h; + + const int stride = h * w; + const int an_stride = (class_num + 5) * stride; + + const T* input_data = input->data(); + const T* gt_box_data = gt_box->data(); + const int* gt_label_data = gt_label->data(); + T* loss_data = loss->mutable_data({n}, ctx.GetPlace()); + memset(loss_data, 0, loss->numel() * sizeof(T)); + T* obj_mask_data = + objness_mask->mutable_data({n, mask_num, h, w}, ctx.GetPlace()); + memset(obj_mask_data, 0, objness_mask->numel() * sizeof(T)); + int* gt_match_mask_data = + gt_match_mask->mutable_data({n, b}, ctx.GetPlace()); + + // calc valid gt box mask, avoid calc duplicately in following code + Tensor gt_valid_mask; + bool* gt_valid_mask_data = + gt_valid_mask.mutable_data({n, b}, ctx.GetPlace()); + GtValid(gt_valid_mask_data, gt_box_data, n, b); + + for (int i = 0; i < n; i++) { + for (int j = 0; j < mask_num; j++) { + for (int k = 0; k < h; k++) { + for (int l = 0; l < w; l++) { + // each predict box find a best match gt box, if overlap is bigger + // then ignore_thresh, ignore the objectness loss. 
+ int box_idx = + GetEntryIndex(i, j, k * w + l, mask_num, an_stride, stride, 0); + Box pred = GetYoloBox(input_data, anchors, l, k, anchor_mask[j], + h, input_size, box_idx, stride); + T best_iou = 0; + for (int t = 0; t < b; t++) { + if (!gt_valid_mask_data[i * b + t]) { + continue; + } + Box gt = GetGtBox(gt_box_data, i, b, t); + T iou = CalcBoxIoU(pred, gt); + if (iou > best_iou) { + best_iou = iou; + } + } + + // If best IoU is bigger then ignore_thresh, + // ignore the objectness loss. + if (best_iou > ignore_thresh) { + int obj_idx = (i * mask_num + j) * stride + k * w + l; + obj_mask_data[obj_idx] = static_cast(-1); + } + // all losses should be calculated if best IoU + // is bigger then truth thresh, but currently, + // truth thresh is an unreachable value as 1.0. + } + } + } + for (int t = 0; t < b; t++) { + if (!gt_valid_mask_data[i * b + t]) { + gt_match_mask_data[i * b + t] = -1; + continue; + } + Box gt = GetGtBox(gt_box_data, i, b, t); + int gi = static_cast(gt.x * w); + int gj = static_cast(gt.y * h); + Box gt_shift = gt; + gt_shift.x = 0.0; + gt_shift.y = 0.0; + T best_iou = 0.0; + int best_n = 0; + // each gt box find a best match anchor box as positive sample, + // for positive sample, all losses should be calculated, and for + // other samples, only objectness loss is required. 
+ for (int an_idx = 0; an_idx < an_num; an_idx++) { + Box an_box; + an_box.x = 0.0; + an_box.y = 0.0; + an_box.w = anchors[2 * an_idx] / static_cast(input_size); + an_box.h = anchors[2 * an_idx + 1] / static_cast(input_size); + float iou = CalcBoxIoU(an_box, gt_shift); + if (iou > best_iou) { + best_iou = iou; + best_n = an_idx; + } + } + + int mask_idx = GetMaskIndex(anchor_mask, best_n); + gt_match_mask_data[i * b + t] = mask_idx; + if (mask_idx >= 0) { + int box_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num, + an_stride, stride, 0); + CalcBoxLocationLoss(loss_data + i, input_data, gt, anchors, best_n, + box_idx, gi, gj, h, input_size, stride); + + int obj_idx = (i * mask_num + mask_idx) * stride + gj * w + gi; + obj_mask_data[obj_idx] = 1.0; + + int label = gt_label_data[i * b + t]; + int label_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num, + an_stride, stride, 5); + CalcLabelLoss(loss_data + i, input_data, label_idx, label, + class_num, stride); + } + } + } + + CalcObjnessLoss(loss_data, input_data + 4 * stride, obj_mask_data, n, + mask_num, h, w, stride, an_stride); + } +}; + +template +class Yolov3LossGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* gt_box = ctx.Input("GTBox"); + auto* gt_label = ctx.Input("GTLabel"); + auto* input_grad = ctx.Output(framework::GradVarName("X")); + auto* loss_grad = ctx.Input(framework::GradVarName("Loss")); + auto* objness_mask = ctx.Input("ObjectnessMask"); + auto* gt_match_mask = ctx.Input("GTMatchMask"); + auto anchors = ctx.Attr>("anchors"); + auto anchor_mask = ctx.Attr>("anchor_mask"); + int class_num = ctx.Attr("class_num"); + int downsample_ratio = ctx.Attr("downsample_ratio"); + + const int n = input_grad->dims()[0]; + const int c = input_grad->dims()[1]; + const int h = input_grad->dims()[2]; + const int w = input_grad->dims()[3]; + const int mask_num = anchor_mask.size(); + const int 
b = gt_match_mask->dims()[1]; + int input_size = downsample_ratio * h; + + const int stride = h * w; + const int an_stride = (class_num + 5) * stride; + + const T* input_data = input->data(); + const T* gt_box_data = gt_box->data(); + const int* gt_label_data = gt_label->data(); + const T* loss_grad_data = loss_grad->data(); + const T* obj_mask_data = objness_mask->data(); + const int* gt_match_mask_data = gt_match_mask->data(); + T* input_grad_data = + input_grad->mutable_data({n, c, h, w}, ctx.GetPlace()); + memset(input_grad_data, 0, input_grad->numel() * sizeof(T)); + + for (int i = 0; i < n; i++) { + for (int t = 0; t < b; t++) { + int mask_idx = gt_match_mask_data[i * b + t]; + if (mask_idx >= 0) { + Box gt = GetGtBox(gt_box_data, i, b, t); + int gi = static_cast(gt.x * w); + int gj = static_cast(gt.y * h); + + int box_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num, + an_stride, stride, 0); + CalcBoxLocationLossGrad( + input_grad_data, loss_grad_data[i], input_data, gt, anchors, + anchor_mask[mask_idx], box_idx, gi, gj, h, input_size, stride); + + int label = gt_label_data[i * b + t]; + int label_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num, + an_stride, stride, 5); + CalcLabelLossGrad(input_grad_data, loss_grad_data[i], input_data, + label_idx, label, class_num, stride); + } + } + } + + CalcObjnessLossGrad(input_grad_data + 4 * stride, loss_grad_data, + input_data + 4 * stride, obj_mask_data, n, mask_num, + h, w, stride, an_stride); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/proto_encoder_helper.h b/paddle/fluid/operators/distributed/proto_encoder_helper.h index 27ca1f4edc04f5fca54b1a6340243634a596939c..e9f06f54327875c0568c571627e9effb998e15be 100644 --- a/paddle/fluid/operators/distributed/proto_encoder_helper.h +++ b/paddle/fluid/operators/distributed/proto_encoder_helper.h @@ -85,7 +85,7 @@ class ProtoEncodeHelper { #define REPLACE_ENFORCE_GLOG 1 // Make sure callers didn't 
do operations that went over max_size promised if (paddle::platform::is_error(p_ <= limit_)) { - paddle::platform::throw_on_error(p_ <= limit_); + paddle::platform::throw_on_error(p_ <= limit_, ""); } #undef REPLACE_ENFORCE_GLOG } diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu index d65491267de1ce3495d8b8250cf0cff570dfcc6a..7a6927d3e54b4ece8f17d7a1e7e431ba836edff9 100644 --- a/paddle/fluid/operators/dropout_op.cu +++ b/paddle/fluid/operators/dropout_op.cu @@ -114,4 +114,5 @@ REGISTER_OP_CUDA_KERNEL( ops::GPUDropoutKernel); REGISTER_OP_CUDA_KERNEL( dropout_grad, ops::DropoutGradKernel, + ops::DropoutGradKernel, ops::DropoutGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc index 7e789cd8d9143164c2346b067855eb904e00075f..c6c658236c235f0a6767924026b0a7610071e918 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc @@ -18,6 +18,7 @@ namespace ops = paddle::operators; REGISTER_ELEMWISE_GRAD_MAKER(elementwise_add, Add); REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_add, "Add", "Out = X + Y", "Out", "X"); + REGISTER_OP_CPU_KERNEL( elementwise_add, ops::ElementwiseAddKernel, diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index fd2a98cb45f446e80a4be1b50e94ee611cd23e62..d04bb8f338a80946e8f1d945f66122f02f526eac 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -250,6 +250,20 @@ class ElemwiseGradKernel : public framework::OpKernel { } }; +class ElementwiseOpInplace : public framework::InplaceInToOut { + public: + using framework::InplaceInToOut::InplaceInToOut; + + protected: + std::unordered_map Apply( + const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + return std::unordered_map{ + {"X", 
"Out"}, + }; + } +}; + } // namespace operators } // namespace paddle @@ -299,6 +313,7 @@ class ElemwiseGradKernel : public framework::OpKernel { REGISTER_OPERATOR(op_type, ::paddle::operators::ElementwiseOp, \ __ElemwiseOp##op_type##Maker__, \ ::paddle::operators::ElementwiseOpInferVarType, \ - op_type##GradMaker); \ + op_type##GradMaker, \ + ::paddle::operators::ElementwiseOpInplace); \ REGISTER_OPERATOR(op_type##_grad, \ ::paddle::operators::ElementwiseOpExplicitGrad) diff --git a/paddle/fluid/operators/elementwise/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc similarity index 100% rename from paddle/fluid/operators/elementwise/elementwise_add_mkldnn_op.cc rename to paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc similarity index 100% rename from paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc rename to paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc index 8e80dc0e641c443923076c31e269689b5bc134a7..bb904166c4a19997a57723d9f2e50cc839aae960 100644 --- a/paddle/fluid/operators/flatten_op.cc +++ b/paddle/fluid/operators/flatten_op.cc @@ -267,6 +267,35 @@ class Flatten2GradOp : public framework::OperatorBase { } }; +class FlattenOpInplaceInToOut : public framework::InplaceInToOut { + public: + using InplaceInToOut::InplaceInToOut; + + protected: + std::unordered_map Apply( + const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + std::unordered_map inplace_in_to_out = { + {"X", "Out"}, + }; + return inplace_in_to_out; + } +}; + +class FlattenGradInplaceinToOut : public framework::InplaceInToOut { + using InplaceInToOut::InplaceInToOut; + + protected: + std::unordered_map Apply( + const 
framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + std::unordered_map inplace_in_to_out = { + {framework::GradVarName("Out"), framework::GradVarName("X")}, + }; + return inplace_in_to_out; + } +}; + } // namespace operators } // namespace paddle @@ -275,10 +304,13 @@ USE_OP(reshape); namespace ops = paddle::operators; REGISTER_OPERATOR(flatten, ops::FlattenOp, ops::FlattenOpMaker, ops::FlattenOpInferShape, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(flatten_grad, ops::FlattenGradOp, ops::FlattenGradInferShape); + paddle::framework::DefaultGradOpDescMaker, + ops::FlattenOpInplaceInToOut); +REGISTER_OPERATOR(flatten_grad, ops::FlattenGradOp, ops::FlattenGradInferShape, + ops::FlattenGradInplaceinToOut); REGISTER_OPERATOR(flatten2, ops::Flatten2Op, ops::Flatten2OpMaker, - ops::Flatten2OpInferShape, ops::Flatten2GradOpMaker); + ops::Flatten2OpInferShape, ops::Flatten2GradOpMaker, + ops::FlattenOpInplaceInToOut); REGISTER_OPERATOR(flatten2_grad, ops::Flatten2GradOp, - ops::Flatten2GradInferShape); + ops::Flatten2GradInferShape, ops::FlattenGradInplaceinToOut); diff --git a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc index a35ee8a09ed5ddcc4ac465d200b84358fa65b2f3..e9e2a3b1f5c1c00bb2e95b6171ecd09bfe7a0d21 100644 --- a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc +++ b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc @@ -79,17 +79,17 @@ void FusionRepeatedFCReluOpMaker::Make() { } template -static void fc_relu(const T* x, const T* w, const T* b, T* y, int m, int n, - int k) { +static void fc_relu(const T* x, const T* w, const T* b, T* y, + const jit::matmul_attr_t& attr) { auto matmul = - jit::Get, platform::CPUPlace>(k); + jit::Get, platform::CPUPlace>(attr); auto addbias_relu = - jit::Get, platform::CPUPlace>(n); - matmul(x, w, y, m, n, k); + jit::Get, platform::CPUPlace>(attr.n); + matmul(x, w, y, &attr); T* dst = y; - for 
(int i = 0; i < m; ++i) { - addbias_relu(b, dst, dst, n); - dst += n; + for (int i = 0; i < attr.m; ++i) { + addbias_relu(b, dst, dst, attr.n); + dst += attr.n; } } @@ -107,32 +107,33 @@ class FusionRepeatedFCReluKernel : public framework::OpKernel { auto i_dims = in->dims(); auto w_dims = weights[0]->dims(); - int m = i_dims[0]; - int n = w_dims[1]; - int k = w_dims[0]; - relus[0]->Resize({m, n}); + jit::matmul_attr_t attr; + attr.m = i_dims[0]; + attr.n = w_dims[1]; + attr.k = w_dims[0]; + relus[0]->Resize({attr.m, attr.n}); fc_relu(in->data(), weights[0]->data(), biases[0]->data(), - relus[0]->mutable_data(place), m, n, k); + relus[0]->mutable_data(place), attr); for (int i = 1; i < weight_sz - 1; ++i) { auto i_dims = relus[i - 1]->dims(); auto w_dims = weights[i]->dims(); - int m = i_dims[0]; - int n = w_dims[1]; - int k = w_dims[0]; - relus[i]->Resize({m, n}); + attr.m = i_dims[0]; + attr.n = w_dims[1]; + attr.k = w_dims[0]; + relus[i]->Resize({attr.m, attr.n}); fc_relu(relus[i - 1]->data(), weights[i]->data(), - biases[i]->data(), relus[i]->mutable_data(place), m, n, k); + biases[i]->data(), relus[i]->mutable_data(place), attr); } auto i_dims_last = relus[weight_sz - 2]->dims(); auto w_dims_last = weights[weight_sz - 1]->dims(); - m = i_dims_last[0]; - n = w_dims_last[1]; - k = w_dims_last[0]; + attr.m = i_dims_last[0]; + attr.n = w_dims_last[1]; + attr.k = w_dims_last[0]; fc_relu(relus[weight_sz - 2]->data(), weights[weight_sz - 1]->data(), - biases[weight_sz - 1]->data(), out->mutable_data(place), m, n, - k); + biases[weight_sz - 1]->data(), out->mutable_data(place), + attr); } }; diff --git a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc index 00dafdead53bbd4614c70875441c565724fca46d..8c8b079633aacb711aa304ec7016c37c6bec61ce 100644 --- a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc +++ b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc @@ -87,15 +87,18 @@ class 
FusionSquaredMatSubKernel : public framework::OpKernel { auto x_dims = x->dims(); auto y_dims = y->dims(); - int m = x_dims[0]; - int k = x_dims[1]; - int n = y_dims[1]; - int o_numel = m * n; + jit::matmul_attr_t attr; + attr.m = x_dims[0]; + attr.k = x_dims[1]; + attr.n = y_dims[1]; + int o_numel = attr.m * attr.n; auto vsquare_x = - jit::Get, platform::CPUPlace>(m * k); + jit::Get, platform::CPUPlace>(attr.m * + attr.k); auto vsquare_y = - jit::Get, platform::CPUPlace>(k * n); + jit::Get, platform::CPUPlace>(attr.k * + attr.n); auto vsquare_xy = jit::Get, platform::CPUPlace>(o_numel); auto vsub = @@ -103,7 +106,7 @@ class FusionSquaredMatSubKernel : public framework::OpKernel { auto vscal = jit::Get, platform::CPUPlace>(o_numel); auto matmul = - jit::Get, platform::CPUPlace>(k); + jit::Get, platform::CPUPlace>(attr); const T* x_data = x->data(); const T* y_data = y->data(); @@ -112,12 +115,12 @@ class FusionSquaredMatSubKernel : public framework::OpKernel { T* squared_xy_data = squared_xy->mutable_data(place); T* o_data = out->mutable_data(place); - matmul(x_data, y_data, squared_xy_data, m, n, k); + matmul(x_data, y_data, squared_xy_data, &attr); vsquare_xy(squared_xy_data, squared_xy_data, o_numel); - vsquare_x(x_data, squared_x_data, m * k); - vsquare_y(y_data, squared_y_data, k * n); - matmul(squared_x_data, squared_y_data, o_data, m, n, k); + vsquare_x(x_data, squared_x_data, attr.m * attr.k); + vsquare_y(y_data, squared_y_data, attr.k * attr.n); + matmul(squared_x_data, squared_y_data, o_data, &attr); vsub(squared_xy_data, o_data, o_data, o_numel); vscal(&scalar, o_data, o_data, o_numel); diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu index 9f4aef08cd58e72ce344a640e6564b9e360ce169..490ba9a585ee8fac82a9e1178f506a6d39e5fd1c 100644 --- a/paddle/fluid/operators/gather_op.cu +++ b/paddle/fluid/operators/gather_op.cu @@ -31,7 +31,7 @@ class GatherOpCUDAKernel : public framework::OpKernel { auto *output = ctx.Output("Out"); 
output->mutable_data(ctx.GetPlace()); - + if (x->numel() == 0) return; GPUGather(ctx.device_context(), *x, *index, output); } }; @@ -45,14 +45,13 @@ class GatherGradOpCUDAKernel : public framework::OpKernel { auto *Index = ctx.Input("Index"); auto *dX = ctx.Output(framework::GradVarName("X")); auto *dO = ctx.Input(framework::GradVarName("Out")); - auto *x = ctx.Input("X"); dX->mutable_data(ctx.GetPlace()); auto dxt = framework::EigenVector::Flatten(*dX); auto &place = *ctx.template device_context() .eigen_device(); dxt.device(place) = dxt.constant(static_cast(0)); - + if (dO->numel() == 0) return; GPUScatterAssign(ctx.device_context(), *dO, *Index, dX); } }; @@ -61,11 +60,14 @@ class GatherGradOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; +namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel, ops::GatherOpCUDAKernel, ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel); + ops::GatherOpCUDAKernel, + ops::GatherOpCUDAKernel); REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel, ops::GatherGradOpCUDAKernel, ops::GatherGradOpCUDAKernel, - ops::GatherGradOpCUDAKernel); + ops::GatherGradOpCUDAKernel, + ops::GatherGradOpCUDAKernel); diff --git a/paddle/fluid/operators/gather_op.h b/paddle/fluid/operators/gather_op.h index 2dd726bebb1bc2e4d83844c0b98df01c390e622f..2e18298cf8e34d5f70369c89b3b3b2a9ced0ce62 100644 --- a/paddle/fluid/operators/gather_op.h +++ b/paddle/fluid/operators/gather_op.h @@ -35,7 +35,7 @@ class GatherOpKernel : public framework::OpKernel { auto *output = ctx.Output("Out"); output->mutable_data(ctx.GetPlace()); - + if (x->numel() == 0) return; CPUGather(ctx.device_context(), *x, *index, output); } }; @@ -56,7 +56,7 @@ class GatherGradientOpKernel : public framework::OpKernel { auto &place = *ctx.template device_context() .eigen_device(); dxt.device(place) = dxt.constant(static_cast(0)); - + if (dO->numel() == 0) return; 
ScatterAssign(ctx.device_context(), *dO, *Index, dX); } }; diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index 2247131137d010dd1872c44c186d75069eb7f308..751091478e286f8252d4b1bd19e1e9ae879dd62c 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -136,7 +136,7 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel { sum.mutable_data(framework::make_ddim(sum_dims), ctx.GetPlace()); auto sum_mat = EigenMatrix::From(sum); out->mutable_data(ctx.GetPlace()); - auto out_mat = framework::EigenVector::Flatten(*out); + auto out_mat = framework::EigenMatrix::From(*out); if (bias) { bit_code->Add(*bias, pre_out); } diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc index 93dd3f794f6087a3158fee1f262795871f21611a..de91ba6270ac2ed22c8380878c0a0037fb1629c0 100644 --- a/paddle/fluid/operators/interpolate_op.cc +++ b/paddle/fluid/operators/interpolate_op.cc @@ -82,6 +82,18 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker { "bilinear interpolation and \"nearest\" for nearest " "neighbor interpolation.") .SetDefault("bilinear"); + AddAttr( + "align_corners", + "an optional bool. Defaults to True. 
" + "If True, the centers of 4 corner pixels of the input and output " + "tensors are aligned, preserving the values at the corner pixels, " + "if False, are not aligned") + .SetDefault(true); + AddAttr("align_mode", + "(int, default \'1\'), optional for bilinear interpolation, " + "can be \'0\' for src_idx = scale*(dst_index+0.5)-0.5 , " + "can be \'1\' for src_idx = scale*dst_index .") + .SetDefault(1); AddComment(R"DOC( This operator samples input X to given output shape by using specified interpolation method, the interpolation methods can be \"nearest\" @@ -98,6 +110,64 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker { to perform linear interpolation first in one direction, and then again in the other direction. + Align_corners and align_mode are optional parameters; the calculation method + of interpolation can be selected by them. + + Example: + + For scale: + + if align_corners = True and out_{size}>1 : + + scale_{factor} = (in_{size}-1.0)/(out_{size}-1.0) + + else: + + scale_{factor} = float(in_{size}/out_{size}) + + + Nearest neighbor interpolation: + + if: + align_corners = False + + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + + H_out = \left \lfloor {H_{in} * scale_{factor}} \right \rfloor + W_out = \left \lfloor {W_{in} * scale_{factor}} \right \rfloor + + else: + align_corners = True + + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + + H_out = round(H_{in} * scale_{factor}) + W_out = round(W_{in} * scale_{factor}) + + Bilinear interpolation: + + if: + align_corners = False , align_mode = 0 + + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + + H_out = (H_{in}+0.5) * scale_{factor} - 0.5 + W_out = (W_{in}+0.5) * scale_{factor} - 0.5 + + + else: + + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + + H_out = H_{in} * scale_{factor} + W_out = W_{in} * scale_{factor} + + + For details of nearest neighbor interpolation, please refer to Wikipedia: 
https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation diff --git a/paddle/fluid/operators/interpolate_op.cu b/paddle/fluid/operators/interpolate_op.cu index 99ac725f73bf60ab0fb9a467432e8a57c646ef35..b887878ea2291d6c56fec91738784e338606b84f 100644 --- a/paddle/fluid/operators/interpolate_op.cu +++ b/paddle/fluid/operators/interpolate_op.cu @@ -23,7 +23,8 @@ __global__ void KeNearestNeighborInterpFw( const T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h, const size_t input_w, T* out, const size_t out_img_h, const size_t out_img_w, const size_t output_h, const size_t output_w, - const size_t num_channels, const float ratio_h, const float ratio_w) { + const size_t num_channels, const float ratio_h, const float ratio_w, + const bool align_corners) { int nthreads = output_h * output_w; int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; @@ -35,10 +36,14 @@ __global__ void KeNearestNeighborInterpFw( int channel_id = out_id_w / out_img_size; int out_img_idy = (out_id_w % out_img_size) / out_img_w; - int in_img_idy = static_cast(ratio_h * out_img_idy + 0.5); + int in_img_idy = (align_corners) + ? static_cast(ratio_h * out_img_idy + 0.5) + : static_cast(ratio_h * out_img_idy); int out_img_idx = tid % out_img_w; - int in_img_idx = static_cast(ratio_w * out_img_idx + 0.5); + int in_img_idx = (align_corners) + ? 
static_cast(ratio_w * out_img_idx + 0.5) + : static_cast(ratio_w * out_img_idx); out[tid] = in[out_id_h * input_w + channel_id * in_img_size + in_img_idy * in_img_w + in_img_idx]; @@ -50,7 +55,8 @@ __global__ void KeNearestNeighborInterpBw( T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h, const size_t input_w, const T* out, const size_t out_img_h, const size_t out_img_w, const size_t output_h, const size_t output_w, - const size_t num_channels, const float ratio_h, const float ratio_w) { + const size_t num_channels, const float ratio_h, const float ratio_w, + const bool align_corners) { int nthreads = output_h * output_w; int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; @@ -62,10 +68,14 @@ __global__ void KeNearestNeighborInterpBw( int channel_id = out_id_w / out_img_size; int out_img_idy = (out_id_w % out_img_size) / out_img_w; - int in_img_idy = static_cast(ratio_h * out_img_idy + 0.5); + int in_img_idy = (align_corners) + ? static_cast(ratio_h * out_img_idy + 0.5) + : static_cast(ratio_h * out_img_idy); int out_img_idx = tid % out_img_w; - int in_img_idx = static_cast(ratio_w * out_img_idx + 0.5); + int in_img_idx = (align_corners) + ? 
static_cast(ratio_w * out_img_idx + 0.5) + : static_cast(ratio_w * out_img_idx); T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size + in_img_idy * in_img_w + in_img_idx]; @@ -79,10 +89,12 @@ __global__ void KeBilinearInterpFw( const T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h, const size_t input_w, T* out, const size_t out_img_h, const size_t out_img_w, const size_t output_h, const size_t output_w, - const size_t num_channels, const float ratio_h, const float ratio_w) { + const size_t num_channels, const float ratio_h, const float ratio_w, + const bool align_corners, const int align_mode) { int nthreads = output_h * output_w; int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; + bool align_flag = (align_mode == 0 && !align_corners); for (; tid < nthreads; tid += stride) { int out_id_h = tid / output_w; int out_id_w = tid % output_w; @@ -91,15 +103,23 @@ __global__ void KeBilinearInterpFw( int channel_id = out_id_w / out_img_size; int out_img_idy = (out_id_w % out_img_size) / out_img_w; - int in_img_idy = ratio_h * out_img_idy; + int in_img_idy = align_flag + ? static_cast(ratio_h * (out_img_idy + 0.5) - 0.5) + : static_cast(ratio_h * out_img_idy); + in_img_idy = (in_img_idy > 0) ? in_img_idy : 0; int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0; - T h1lambda = ratio_h * out_img_idy - in_img_idy; + T h1lambda = align_flag ? ratio_h * (out_img_idy + 0.5) - 0.5 - in_img_idy + : ratio_h * out_img_idy - in_img_idy; T h2lambda = 1.f - h1lambda; int out_img_idx = tid % out_img_w; - int in_img_idx = ratio_w * out_img_idx; + int in_img_idx = align_flag + ? static_cast(ratio_w * (out_img_idx + 0.5) - 0.5) + : static_cast(ratio_w * out_img_idx); + in_img_idx = (in_img_idx > 0) ? in_img_idx : 0; int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0; - T w1lambda = ratio_w * out_img_idx - in_img_idx; + T w1lambda = align_flag ? 
ratio_w * (out_img_idx + 0.5) - 0.5 - in_img_idx + : ratio_w * out_img_idx - in_img_idx; T w2lambda = 1.f - w1lambda; const T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size + @@ -118,10 +138,12 @@ __global__ void KeBilinearInterpBw( T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h, const size_t input_w, const T* out, const size_t out_img_h, const size_t out_img_w, const size_t output_h, const size_t output_w, - const size_t num_channels, const T ratio_h, const T ratio_w) { + const size_t num_channels, const T ratio_h, const T ratio_w, + const bool align_corners, const int align_mode) { int nthreads = output_h * output_w; int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; + bool align_flag = (align_mode == 0 && !align_corners); for (; tid < nthreads; tid += stride) { int out_id_h = tid / output_w; int out_id_w = tid % output_w; @@ -130,15 +152,22 @@ __global__ void KeBilinearInterpBw( int channel_id = out_id_w / out_img_size; int out_img_idy = (out_id_w % out_img_size) / out_img_w; - int in_img_idy = ratio_h * out_img_idy; + int in_img_idy = align_flag ? ratio_h * (out_img_idy + 0.5) - 0.5 + : ratio_h * out_img_idy; + in_img_idy = (in_img_idy > 0) ? in_img_idy : 0; int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0; - T h1lambda = ratio_h * out_img_idy - in_img_idy; + T h1lambda = align_flag ? ratio_h * (out_img_idy + 0.5) - 0.5 - in_img_idy + : ratio_h * out_img_idy - in_img_idy; + T h2lambda = 1.f - h1lambda; int out_img_idx = tid % out_img_w; - int in_img_idx = ratio_w * out_img_idx; + int in_img_idx = align_flag ? ratio_w * (out_img_idx + 0.5) - 0.5 + : ratio_w * out_img_idx; + in_img_idx = (in_img_idx > 0) ? in_img_idx : 0; int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0; - T w1lambda = ratio_w * out_img_idx - in_img_idx; + T w1lambda = align_flag ? 
ratio_w * (out_img_idx + 0.5) - 0.5 - in_img_idx + : ratio_w * out_img_idx - in_img_idx; T w2lambda = 1.f - w1lambda; T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size + @@ -175,6 +204,9 @@ class InterpolateOpCUDAKernel : public framework::OpKernel { out_w = size_data[1]; } + bool align_corners = ctx.Attr("align_corners"); + int align_mode = ctx.Attr("align_mode"); + int n = input->dims()[0]; int c = input->dims()[1]; int in_h = input->dims()[2]; @@ -188,10 +220,16 @@ class InterpolateOpCUDAKernel : public framework::OpKernel { int in_chw = c * in_hw; int out_chw = c * out_hw; - float ratio_h = - (out_h > 1) ? static_cast(in_h - 1) / (out_h - 1) : 0.f; - float ratio_w = - (out_w > 1) ? static_cast(in_w - 1) / (out_w - 1) : 0.f; + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_h > 1) { + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(in_h) / out_h; + } + if (out_w > 1) { + ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) + : static_cast(in_w) / out_w; + } if (in_h == out_h && in_w == out_w) { framework::TensorCopy(*input, ctx.GetPlace(), output); @@ -206,12 +244,12 @@ class InterpolateOpCUDAKernel : public framework::OpKernel { KeNearestNeighborInterpFw< T><<>>( input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, - out_chw, c, ratio_h, ratio_w); + out_chw, c, ratio_h, ratio_w, align_corners); } else if ("bilinear" == interp_method) { KeBilinearInterpFw< T><<>>( input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, - out_chw, c, ratio_h, ratio_w); + out_chw, c, ratio_h, ratio_w, align_corners, align_mode); } } }; @@ -234,6 +272,10 @@ class InterpolateGradOpCUDAKernel : public framework::OpKernel { int out_h = ctx.Attr("out_h"); int out_w = ctx.Attr("out_w"); auto out_size = ctx.Input("OutSize"); + + bool align_corners = ctx.Attr("align_corners"); + int align_mode = ctx.Attr("align_mode"); + if (out_size != nullptr) { Tensor sizes; framework::TensorCopy(*out_size, 
platform::CPUPlace(), &sizes); @@ -252,10 +294,16 @@ class InterpolateGradOpCUDAKernel : public framework::OpKernel { int in_chw = c * in_hw; int out_chw = c * out_hw; - float ratio_h = - (out_h > 1) ? static_cast(in_h - 1) / (out_h - 1) : 0.f; - float ratio_w = - (out_w > 1) ? static_cast(in_w - 1) / (out_w - 1) : 0.f; + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_h > 1) { + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(in_h) / out_h; + } + if (out_w > 1) { + ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) + : static_cast(in_w) / out_w; + } if (in_h == out_h && in_w == out_w) { framework::TensorCopy(*output_grad, ctx.GetPlace(), input_grad); @@ -270,12 +318,12 @@ class InterpolateGradOpCUDAKernel : public framework::OpKernel { KeNearestNeighborInterpBw< T><<>>( input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, - out_w, n, out_chw, c, ratio_h, ratio_w); + out_w, n, out_chw, c, ratio_h, ratio_w, align_corners); } else if ("bilinear" == interp_method) { KeBilinearInterpBw< T><<>>( input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, - out_w, n, out_chw, c, ratio_h, ratio_w); + out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, align_mode); } } }; diff --git a/paddle/fluid/operators/interpolate_op.h b/paddle/fluid/operators/interpolate_op.h index 7fdb3e1f5a2ff82284d89dd0759e357978e1d873..c631ad1dd158ce114169602f073d69b2291b5b3b 100644 --- a/paddle/fluid/operators/interpolate_op.h +++ b/paddle/fluid/operators/interpolate_op.h @@ -26,14 +26,17 @@ template static void NearestNeighborInterpolate(const Tensor& input, Tensor* output, const float ratio_h, const float ratio_w, const int n, const int c, - const int out_h, const int out_w) { + const int out_h, const int out_w, + const bool align_corners) { auto input_t = EigenTensor::From(input); auto output_t = EigenTensor::From(*output); for (int k = 0; k < out_h; k++) { // loop for images - int in_k = static_cast(ratio_h * k + 0.5); + 
int in_k = (align_corners) ? static_cast(ratio_h * k + 0.5) + : static_cast(ratio_h * k); for (int l = 0; l < out_w; l++) { - int in_l = static_cast(ratio_w * l + 0.5); + int in_l = (align_corners) ? static_cast(ratio_w * l + 0.5) + : static_cast(ratio_w * l); for (int i = 0; i < n; i++) { // loop for batches for (int j = 0; j < c; j++) { // loop for channels @@ -48,20 +51,29 @@ template static void BilinearInterpolation(const Tensor& input, Tensor* output, const float ratio_h, const float ratio_w, const int in_h, const int in_w, const int n, - const int c, const int out_h, - const int out_w) { + const int c, const int out_h, const int out_w, + const bool align_corners, + const bool align_mode) { auto input_t = EigenTensor::From(input); auto output_t = EigenTensor::From(*output); + bool align_flag = (align_mode == 0 && !align_corners); for (int k = 0; k < out_h; k++) { // loop for images - int y_n = static_cast(ratio_h * k); + int y_n = align_flag ? static_cast(ratio_h * (k + 0.5) - 0.5) + : static_cast(ratio_h * k); + y_n = (y_n > 0) ? y_n : 0; int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1); - float d_n = ratio_h * k - y_n; + float d_n = + align_flag ? ratio_h * (k + 0.5) - 0.5 - y_n : ratio_h * k - y_n; float d_s = 1.f - d_n; for (int l = 0; l < out_w; l++) { - int x_w = static_cast(ratio_w * l); + int x_w = (align_mode == 0 && !align_corners) + ? static_cast(ratio_w * (l + 0.5) - 0.5) + : static_cast(ratio_w * l); + x_w = (x_w > 0) ? x_w : 0; int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1); - float d_w = ratio_w * l - x_w; + float d_w = + align_flag ? 
ratio_w * (l + 0.5) - 0.5 - x_w : ratio_w * l - x_w; float d_e = 1.f - d_w; for (int i = 0; i < n; i++) { // loop for batches @@ -78,19 +90,20 @@ static void BilinearInterpolation(const Tensor& input, Tensor* output, } template -static void NearestNeighborInterpolateGrad(const Tensor& output_grad, - Tensor* input_grad, - const float ratio_h, - const float ratio_w, const int n, - const int c, const int out_h, - const int out_w) { +static void NearestNeighborInterpolateGrad( + const Tensor& output_grad, Tensor* input_grad, const float ratio_h, + const float ratio_w, const int n, const int c, const int out_h, + const int out_w, const bool align_corners) { auto input_grad_t = EigenTensor::From(*input_grad); auto output_grad_t = EigenTensor::From(output_grad); + for (int k = 0; k < out_h; k++) { // loop for images - int in_k = static_cast(ratio_h * k + 0.5); + int in_k = (align_corners) ? static_cast(ratio_h * k + 0.5) + : static_cast(ratio_h * k); for (int l = 0; l < out_w; l++) { - int in_l = static_cast(ratio_w * l + 0.5); + int in_l = (align_corners) ? static_cast(ratio_w * l + 0.5) + : static_cast(ratio_w * l); for (int i = 0; i < n; i++) { // loop for batches for (int j = 0; j < c; j++) { // loop for channels @@ -106,19 +119,28 @@ static void BilinearInterpolationGrad(const Tensor& output_grad, Tensor* input_grad, const float ratio_h, const float ratio_w, const int in_h, const int in_w, const int n, const int c, - const int out_h, const int out_w) { + const int out_h, const int out_w, + const bool align_corners, + const int align_mode) { auto input_grad_t = EigenTensor::From(*input_grad); auto output_grad_t = EigenTensor::From(output_grad); + bool align_flag = (align_mode == 0 && !align_corners); for (int k = 0; k < out_h; k++) { // loop for images - int y_n = static_cast(ratio_h * k); + int y_n = align_flag ? static_cast(ratio_h * (k + 0.5) - 0.5) + : static_cast(ratio_h * k); + y_n = (y_n > 0) ? y_n : 0; int y_s = (y_n + 1) < (in_h - 1) ? 
(y_n + 1) : (in_h - 1); - float d_n = ratio_h * k - y_n; + float d_n = + align_flag ? ratio_h * (k + 0.5) - 0.5 - y_n : ratio_h * k - y_n; float d_s = 1.f - d_n; for (int l = 0; l < out_w; l++) { - int x_w = static_cast(ratio_w * l); + int x_w = align_flag ? static_cast(ratio_w * (l + 0.5) - 0.5) + : static_cast(ratio_w * l); + x_w = (x_w > 0) ? x_w : 0; int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1); - float d_w = ratio_w * l - x_w; + float d_w = + align_flag ? ratio_w * (l + 0.5) - 0.5 - x_w : ratio_w * l - x_w; float d_e = 1.f - d_w; for (int i = 0; i < n; i++) { // loop for batches @@ -134,7 +156,6 @@ static void BilinearInterpolationGrad(const Tensor& output_grad, } } } - template class InterpolateKernel : public framework::OpKernel { public: @@ -151,6 +172,8 @@ class InterpolateKernel : public framework::OpKernel { out_h = out_size_data[0]; out_w = out_size_data[1]; } + bool align_corners = ctx.Attr("align_corners"); + int align_mode = ctx.Attr("align_mode"); const int n = input->dims()[0]; const int c = input->dims()[1]; @@ -168,17 +191,24 @@ class InterpolateKernel : public framework::OpKernel { return; } - float ratio_h = - (out_h > 1) ? static_cast(in_h - 1) / (out_h - 1) : 0.f; - float ratio_w = - (out_w > 1) ? static_cast(in_w - 1) / (out_w - 1) : 0.f; + float ratio_h = 0.f; + float ratio_w = 0.f; + + if (out_h > 1) { + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(in_h) / out_h; + } + if (out_w > 1) { + ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) + : static_cast(in_w) / out_w; + } if ("bilinear" == interp_method) { BilinearInterpolation(*input, output, ratio_h, ratio_w, in_h, in_w, n, - c, out_h, out_w); + c, out_h, out_w, align_corners, align_mode); } else if ("nearest" == interp_method) { NearestNeighborInterpolate(*input, output, ratio_h, ratio_w, n, c, - out_h, out_w); + out_h, out_w, align_corners); } } }; @@ -200,6 +230,8 @@ class InterpolateGradKernel : public framework::OpKernel { out_h = out_size_data[0]; out_w = out_size_data[1]; } + bool align_corners = ctx.Attr("align_corners"); + int align_mode = ctx.Attr("align_mode"); const int n = input->dims()[0]; const int c = input->dims()[1]; @@ -217,17 +249,26 @@ class InterpolateGradKernel : public framework::OpKernel { return; } - float ratio_h = - (out_h > 1) ? static_cast(in_h - 1) / (out_h - 1) : 0.f; - float ratio_w = - (out_w > 1) ? static_cast(in_w - 1) / (out_w - 1) : 0.f; + float ratio_h = 0.f; + float ratio_w = 0.f; + + if (out_h > 1) { + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(in_h) / out_h; + } + if (out_w > 1) { + ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) + : static_cast(in_w) / out_w; + } if ("bilinear" == interp_method) { BilinearInterpolationGrad(*output_grad, input_grad, ratio_h, ratio_w, - in_h, in_w, n, c, out_h, out_w); + in_h, in_w, n, c, out_h, out_w, + align_corners, align_mode); } else if ("nearest" == interp_method) { NearestNeighborInterpolateGrad(*output_grad, input_grad, ratio_h, - ratio_w, n, c, out_h, out_w); + ratio_w, n, c, out_h, out_w, + align_corners); } } }; diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 186c37c56ec9410ac9a31503e33e7e334d0afc40..97ddf223aefcdfaf8a488f93a152336c1ed458f4 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -93,6 +93,7 @@ std::vector TestSizes() { template struct BenchFunc { // return this function avg time + // TODO(TJ): clear cache every time double operator()(const typename KernelTuples::func_type tgt, Args... args) { for (int i = 0; i < FLAGS_burning; ++i) { tgt(args...); @@ -158,7 +159,7 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... 
args) { using Tensor = paddle::framework::Tensor; -template +template void BenchXYZNKernel() { for (int d : TestSizes()) { Tensor x, y, z; @@ -172,10 +173,13 @@ void BenchXYZNKernel() { RandomVec(d, y_data); BenchAllImpls, PlaceType>(d, x.data(), y.data(), z_data, d); + // test inplace + BenchAllImpls, PlaceType>(d, x.data(), z_data, + z_data, d); } } -template +template void BenchAXYNKernel() { for (int d : TestSizes()) { const T a = static_cast(3); @@ -187,10 +191,23 @@ void BenchAXYNKernel() { RandomVec(d, x_data); BenchAllImpls, PlaceType>(d, &a, x.data(), y_data, d); + // test inplace + BenchAllImpls, PlaceType>(d, &a, x.data(), x_data, + d); + } +} + +template +void BenchXRNKernel() { + for (int d : TestSizes()) { + Tensor x; + RandomVec(d, x.mutable_data({d}, PlaceType())); + T res; + BenchAllImpls, PlaceType>(d, x.data(), &res, d); } } -template +template void BenchXYNKernel() { for (int d : TestSizes()) { Tensor x, y; @@ -203,7 +220,7 @@ void BenchXYNKernel() { } } -template +template void BenchLSTMKernel() { for (bool use_peephole : {true, false}) { for (int d : TestSizes()) { @@ -240,7 +257,7 @@ void BenchLSTMKernel() { } } -template +template void BenchGRUKernel() { for (int d : TestSizes()) { const jit::gru_attr_t attr(d, jit::kVSigmoid, jit::kVTanh); @@ -262,7 +279,7 @@ void BenchGRUKernel() { } } -template +template void BenchSeqPoolKernel() { std::vector pool_types = { jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt}; @@ -284,7 +301,7 @@ void BenchSeqPoolKernel() { } } -template +template void BenchMatMulKernel() { for (int m : {1, 2, 3, 4}) { for (int n : TestSizes()) { @@ -298,64 +315,72 @@ void BenchMatMulKernel() { const T* a_data = a.data(); const T* b_data = b.data(); T* c_data = c.mutable_data(PlaceType()); - BenchAllImpls, PlaceType>(k, a_data, b_data, - c_data, m, n, k); + const jit::matmul_attr_t attr{m, n, k}; + BenchAllImpls, PlaceType>(attr, a_data, b_data, + c_data, &attr); } } } } +template +void 
BenchSoftmaxKernel() { + for (int bs : {1, 2, 10}) { + for (int n : TestSizes()) { + Tensor x, y; + x.Resize({bs, n}); + y.Resize({bs, n}); + RandomVec(bs * n, x.mutable_data(PlaceType()), -2.f, 2.f); + const T* x_data = x.data(); + T* y_data = y.mutable_data(PlaceType()); + BenchAllImpls, PlaceType>(n, x_data, y_data, n, + bs); + } + } +} + using T = float; -using PlaceType = paddle::platform::CPUPlace; +using CPUPlace = paddle::platform::CPUPlace; // xyzn -BENCH_FP32_CPU(kVMul) { BenchXYZNKernel(); } - -BENCH_FP32_CPU(kVAdd) { BenchXYZNKernel(); } - -BENCH_FP32_CPU(kVAddRelu) { BenchXYZNKernel(); } - -BENCH_FP32_CPU(kVSub) { BenchXYZNKernel(); } +BENCH_FP32_CPU(kVMul) { BenchXYZNKernel(); } +BENCH_FP32_CPU(kVAdd) { BenchXYZNKernel(); } +BENCH_FP32_CPU(kVAddRelu) { BenchXYZNKernel(); } +BENCH_FP32_CPU(kVSub) { BenchXYZNKernel(); } // axyn -BENCH_FP32_CPU(kVScal) { BenchAXYNKernel(); } +BENCH_FP32_CPU(kVScal) { BenchAXYNKernel(); } +BENCH_FP32_CPU(kVAddBias) { BenchAXYNKernel(); } -BENCH_FP32_CPU(kVAddBias) { BenchAXYNKernel(); } +// xrn +BENCH_FP32_CPU(kHSum) { BenchXRNKernel(); } +BENCH_FP32_CPU(kHMax) { BenchXRNKernel(); } // xyn -BENCH_FP32_CPU(kVRelu) { BenchXYNKernel(); } - -BENCH_FP32_CPU(kVIdentity) { BenchXYNKernel(); } - -BENCH_FP32_CPU(kVSquare) { BenchXYNKernel(); } - -BENCH_FP32_CPU(kVExp) { BenchXYNKernel(); } - -BENCH_FP32_CPU(kVSigmoid) { BenchXYNKernel(); } - -BENCH_FP32_CPU(kVTanh) { BenchXYNKernel(); } +BENCH_FP32_CPU(kVRelu) { BenchXYNKernel(); } +BENCH_FP32_CPU(kVIdentity) { BenchXYNKernel(); } +BENCH_FP32_CPU(kVSquare) { BenchXYNKernel(); } +BENCH_FP32_CPU(kVExp) { BenchXYNKernel(); } +BENCH_FP32_CPU(kVSigmoid) { BenchXYNKernel(); } +BENCH_FP32_CPU(kVTanh) { BenchXYNKernel(); } // lstm and peephole -BENCH_FP32_CPU(kLSTMCtHt) { BenchLSTMKernel(); } - -BENCH_FP32_CPU(kLSTMC1H1) { BenchLSTMKernel(); } +BENCH_FP32_CPU(kLSTMCtHt) { BenchLSTMKernel(); } +BENCH_FP32_CPU(kLSTMC1H1) { BenchLSTMKernel(); } // gru functions -BENCH_FP32_CPU(kGRUH1) { 
BenchGRUKernel(); } - -BENCH_FP32_CPU(kGRUHtPart1) { - BenchGRUKernel(); -} - -BENCH_FP32_CPU(kGRUHtPart2) { - BenchGRUKernel(); -} +BENCH_FP32_CPU(kGRUH1) { BenchGRUKernel(); } +BENCH_FP32_CPU(kGRUHtPart1) { BenchGRUKernel(); } +BENCH_FP32_CPU(kGRUHtPart2) { BenchGRUKernel(); } // seq pool function -BENCH_FP32_CPU(kSeqPool) { BenchSeqPoolKernel(); } +BENCH_FP32_CPU(kSeqPool) { BenchSeqPoolKernel(); } // matmul -BENCH_FP32_CPU(kMatMul) { BenchMatMulKernel(); } +BENCH_FP32_CPU(kMatMul) { BenchMatMulKernel(); } + +// softmax +BENCH_FP32_CPU(kSoftmax) { BenchSoftmaxKernel(); } // Benchmark all jit kernels including jitcode, mkl and refer. // To use this tool, run command: ./benchmark [options...] diff --git a/paddle/fluid/operators/jit/gen/CMakeLists.txt b/paddle/fluid/operators/jit/gen/CMakeLists.txt index 40310c2d2b372a414054f75348e8e1b4471bf3d2..efc7eb79d36c5cf9fac4ac40db4e2e28cb242e22 100644 --- a/paddle/fluid/operators/jit/gen/CMakeLists.txt +++ b/paddle/fluid/operators/jit/gen/CMakeLists.txt @@ -9,6 +9,7 @@ function(USE_JITKERNEL_GEN TARGET) endfunction() # use gen jitcode kernel by name +USE_JITKERNEL_GEN(kMatMul) USE_JITKERNEL_GEN(kVMul) USE_JITKERNEL_GEN(kVAdd) USE_JITKERNEL_GEN(kVSub) @@ -28,3 +29,5 @@ USE_JITKERNEL_GEN(kGRUHtPart1) USE_JITKERNEL_GEN(kGRUHtPart2) USE_JITKERNEL_GEN(kNCHW16CMulNC) USE_JITKERNEL_GEN(kSeqPool) +USE_JITKERNEL_GEN(kHMax) +USE_JITKERNEL_GEN(kHSum) diff --git a/paddle/fluid/operators/jit/gen/act.cc b/paddle/fluid/operators/jit/gen/act.cc index a2a5661b93ad3d885983c502566860aa313d110f..e7a7375879064eb27c94315fe7b93eece7866b92 100644 --- a/paddle/fluid/operators/jit/gen/act.cc +++ b/paddle/fluid/operators/jit/gen/act.cc @@ -81,9 +81,7 @@ void VActJitCode::genCode() { #define DECLARE_ACT_CREATOR(name) \ class name##Creator : public JitCodeCreator { \ public: \ - bool UseMe(const int& attr) const override { \ - return platform::MayIUse(platform::avx); \ - } \ + bool UseMe(const int& attr) const override; \ size_t CodeSize(const int& d) 
const override; \ std::unique_ptr CreateJitCode(const int& attr) const override { \ return make_unique(attr, CodeSize(attr)); \ @@ -98,6 +96,30 @@ DECLARE_ACT_CREATOR(VSigmoid); DECLARE_ACT_CREATOR(VTanh); // TODO(TJ): tuning use me +bool VReluCreator::UseMe(const int& d) const { + return platform::MayIUse(platform::avx); +} + +bool VSquareCreator::UseMe(const int& d) const { + return platform::MayIUse(platform::avx); +} + +bool VIdentityCreator::UseMe(const int& d) const { + return platform::MayIUse(platform::avx); +} + +bool VExpCreator::UseMe(const int& d) const { + return platform::MayIUse(platform::avx) && d < 32; +} + +bool VSigmoidCreator::UseMe(const int& d) const { + return platform::MayIUse(platform::avx); +} + +bool VTanhCreator::UseMe(const int& d) const { + return platform::MayIUse(platform::avx); +} + size_t VReluCreator::CodeSize(const int& d) const { return 96 /* init size */ + (d / YMM_FLOAT_BLOCK + 3) * 4 /* instructions */ * diff --git a/paddle/fluid/operators/jit/gen/blas.cc b/paddle/fluid/operators/jit/gen/blas.cc index dee6c7b9d3ee9756c1b11d10d55fdca341cbee85..5da24c359edd2df93333fe0ca8a18cdc7385aadb 100644 --- a/paddle/fluid/operators/jit/gen/blas.cc +++ b/paddle/fluid/operators/jit/gen/blas.cc @@ -155,7 +155,7 @@ class NCHW16CMulNCCreator : public JitCodeCreator { class name##Creator : public JitCodeCreator { \ public: \ bool UseMe(const int& attr) const override { \ - return platform::MayIUse(platform::avx); \ + return platform::MayIUse(platform::avx) && attr <= 1024; \ } \ size_t CodeSize(const int& d) const override { \ return 96 + d / YMM_FLOAT_BLOCK * 4 * 8; \ diff --git a/paddle/fluid/operators/jit/gen/blas.h b/paddle/fluid/operators/jit/gen/blas.h index de6b33f467279124d7acd97709516c31706ec4f9..66a97c1be503b0fa983f9a7ec3b61c986774f16b 100644 --- a/paddle/fluid/operators/jit/gen/blas.h +++ b/paddle/fluid/operators/jit/gen/blas.h @@ -61,6 +61,7 @@ class VXXJitCode : public JitCode { base += "_Vec"; } base += (with_relu_ ? 
"_Relu" : ""); + base += "_D" + std::to_string(num_); return base.c_str(); } void genCode() override; diff --git a/paddle/fluid/operators/jit/gen/hopv.cc b/paddle/fluid/operators/jit/gen/hopv.cc new file mode 100644 index 0000000000000000000000000000000000000000..e7884017198623d996fe98a55691da6e342d656a --- /dev/null +++ b/paddle/fluid/operators/jit/gen/hopv.cc @@ -0,0 +1,103 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ + +#include "paddle/fluid/operators/jit/gen/hopv.h" +#include "paddle/fluid/operators/jit/registry.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +void HOPVJitCode::genCode() { + const int num_blocks = num_ / YMM_FLOAT_BLOCK; + int offset = 0; + + if (num_blocks > 0) { + // load one firstly + vmovups(ymm_tmp, ptr[param_src]); + offset += sizeof(float) * YMM_FLOAT_BLOCK; + for (int i = 1; i < num_blocks; ++i) { + vmovups(ymm_src, ptr[param_src + offset]); + process(ymm_tmp, ymm_src, ymm_tmp); + offset += sizeof(float) * YMM_FLOAT_BLOCK; + } + vextractf128(xmm_dst, ymm_tmp, 1); + process(xmm_dst, xmm_dst, xmm_tmp); + } else { + if (type_ == operand_type::MAX) { + vbroadcastss(ymm_dst, ptr[param_src]); + } else if (type_ == operand_type::ADD) { + vxorps(ymm_dst, ymm_dst, ymm_dst); + } + } + + int rest = num_ % YMM_FLOAT_BLOCK; + if (rest >= 4) { + vmovups(xmm_src, ptr[param_src + offset]); + offset += sizeof(float) * 4; + rest -= 4; + process(xmm_dst, xmm_dst, xmm_src); + } + + vpermilps(xmm_tmp, xmm_dst, 16 + 8 + 3); + process(xmm_dst, xmm_dst, xmm_tmp); + + if (rest >= 2) { + vmovq(xmm_src, ptr[param_src + offset]); + offset += sizeof(float) * 2; + rest -= 2; + process(xmm_dst, xmm_dst, xmm_src); + } + + vpermilps(xmm_tmp, xmm_dst, 1); + process(xmm_dst, xmm_dst, xmm_tmp); + + if (rest >= 1) { + vmovss(xmm_src, ptr[param_src + offset]); + process(xmm_dst, xmm_dst, xmm_src); + } + vmovss(ptr[param_dst], xmm_dst); + ret(); +} + +#define DECLARE_HOP_CREATOR(name) \ + class name##Creator : public JitCodeCreator { \ + public: \ + bool UseMe(const int& attr) const override { \ + return platform::MayIUse(platform::avx); \ + } \ + size_t CodeSize(const int& d) const override { \ + return 96 + d / YMM_FLOAT_BLOCK * 4 * 8; \ + } \ + std::unique_ptr CreateJitCode(const int& attr) const override { \ + return make_unique(attr, CodeSize(attr)); \ + } \ + } + +DECLARE_HOP_CREATOR(HMax); 
+DECLARE_HOP_CREATOR(HSum); + +#undef DECLARE_HOP_CREATOR + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle + +namespace gen = paddle::operators::jit::gen; + +REGISTER_JITKERNEL_GEN(kHMax, gen::HMaxCreator); +REGISTER_JITKERNEL_GEN(kHSum, gen::HSumCreator); diff --git a/paddle/fluid/operators/jit/gen/hopv.h b/paddle/fluid/operators/jit/gen/hopv.h new file mode 100644 index 0000000000000000000000000000000000000000..d3bc94b63d3f962cd655367a2afe1a08582b06fa --- /dev/null +++ b/paddle/fluid/operators/jit/gen/hopv.h @@ -0,0 +1,90 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ + +#pragma once + +#include +#include "glog/logging.h" +#include "paddle/fluid/operators/jit/gen/jitcode.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +// horizontal operand vector +class HOPVJitCode : public JitCode { + public: + explicit HOPVJitCode(int d, operand_type type, size_t code_size = 256 * 1024, + void* code_ptr = nullptr) + : JitCode(code_size, code_ptr), num_(d), type_(type) { + if (!(type_ == operand_type::MAX || type_ == operand_type::ADD)) { + LOG(FATAL) << "Do not support this operand type: " << type_; + } + this->genCode(); + } + + virtual const char* name() const { + std::string base = "VXXJitCode"; + if (type_ == operand_type::MAX) { + base += "_MAX"; + } else { + base += "_SUM"; + } + return base.c_str(); + } + void genCode() override; + + protected: + template + void process(JMM& dst, JMM& src1, JMM& src2) { // NOLINT + if (type_ == operand_type::MAX) { + vmaxps(dst, src1, src2); + } else if (type_ == operand_type::ADD) { + vaddps(dst, src1, src2); + } + } + + private: + int num_; + operand_type type_; + reg64_t param_src{abi_param1}; + reg64_t param_dst{abi_param2}; + reg64_t param_attr{abi_param3}; + + ymm_t ymm_tmp = ymm_t(0); + ymm_t ymm_src = ymm_t(1); + ymm_t ymm_dst = ymm_t(2); + + xmm_t xmm_tmp = xmm_t(0); + xmm_t xmm_src = xmm_t(1); + xmm_t xmm_dst = xmm_t(2); +}; + +#define DECLARE_HOP_JITCODE(name, op_type) \ + class name##JitCode : public HOPVJitCode { \ + public: \ + explicit name##JitCode(int d, size_t code_size, void* code_ptr = nullptr) \ + : HOPVJitCode(d, op_type, code_size, code_ptr) {} \ + }; + +DECLARE_HOP_JITCODE(HMax, operand_type::MAX); +DECLARE_HOP_JITCODE(HSum, operand_type::ADD); + +#undef DECLARE_HOP_JITCODE + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h index 
f63d40ad5a559ab87a9b3735406671cfd936d9e4..c388109604bc57e8475e79a6c57eecb5bfebfb52 100644 --- a/paddle/fluid/operators/jit/gen/jitcode.h +++ b/paddle/fluid/operators/jit/gen/jitcode.h @@ -47,6 +47,7 @@ using Label = Xbyak::Label; typedef enum { MUL = 0, + MAX, ADD, SUB, RELU, diff --git a/paddle/fluid/operators/jit/gen/matmul.cc b/paddle/fluid/operators/jit/gen/matmul.cc new file mode 100644 index 0000000000000000000000000000000000000000..ae3858eab20aeb80553d8fcec4088a6632c9c17d --- /dev/null +++ b/paddle/fluid/operators/jit/gen/matmul.cc @@ -0,0 +1,128 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/fluid/operators/jit/gen/matmul.h" +#include // offsetof +#include + +#include "paddle/fluid/operators/jit/registry.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +void MatMulJitCode::genCode() { + preCode(); + int block, rest; + const auto groups = packed_groups(n_, k_, &block, &rest); + PADDLE_ENFORCE_GT(groups.front(), 0); + + const int block_len = sizeof(float) * block; + const int x_reg_idx = (block == ZMM_FLOAT_BLOCK ? 
32 : 16) - 1; + const int w_reg_idx = x_reg_idx - 1; + // from packed mov(reg_ptr_wgt, ptr[param_attr + offsetof(matmul_attr_t, + // packed_weight)]); + mov(reg_ptr_wgt, param_y); + size_t z_offset = 0; + size_t wgt_offset = 0; + for (size_t g = 0; g < groups.size(); ++g) { + size_t x_offset = 0; + for (int k = 0; k < k_; ++k) { + vbroadcastss(zmm_t(x_reg_idx), ptr[param_x + x_offset]); + // clean + if (k == 0) { + for (int i = 0; i < groups[g]; ++i) { + vxorps(zmm_t(i), zmm_t(i), zmm_t(i)); + } + } + for (int i = 0; i < groups[g]; ++i) { + vmovups(zmm_t(w_reg_idx), ptr[reg_ptr_wgt + wgt_offset]); + vfmadd231ps(zmm_t(i), zmm_t(w_reg_idx), zmm_t(x_reg_idx)); + wgt_offset += block_len; + } + // last one, save + if (k == k_ - 1) { + for (int i = 0; i < groups[g]; ++i) { + // only rest save should be careful + if (rest != 0 && g == groups.size() - 1 && i == groups[g] - 1) { + break; + } + vmovups(ptr[param_z + z_offset + i * block_len], zmm_t(i)); + } + } + x_offset += sizeof(float); + } + z_offset += block_len * groups[g]; + } + + if (rest != 0) { + // below should refine with mask + int reg_idx = groups.back() - 1; + z_offset = (n_ - rest) * sizeof(float); + int inner_block = 8; + while (rest > 0) { + if (rest >= 8) { + inner_block = 8; + vmovups(ptr[param_z + z_offset], ymm_t(reg_idx)); + // shift zmm of inner_block, change reg_idx if update + } else if (rest >= 4) { + inner_block = 4; + vmovups(ptr[param_z + z_offset], xmm_t(reg_idx)); + } else if (rest >= 2) { + inner_block = 2; + vmovq(ptr[param_z + z_offset], xmm_t(reg_idx)); + } else { + inner_block = 1; + vmovss(ptr[param_z + z_offset], xmm_t(reg_idx)); + } + z_offset += inner_block * sizeof(float); + rest -= inner_block; + } + } + + postCode(); +} + +class MatMulCreator : public JitCodeCreator { + public: + bool UseMe(const matmul_attr_t& attr) const override { + return attr.m == 1 && platform::MayIUse(platform::avx512f) && + attr.n % ZMM_FLOAT_BLOCK == 0 && attr.k < 512; + } + size_t CodeSize(const 
matmul_attr_t& attr) const override { + int block = YMM_FLOAT_BLOCK; + if (platform::MayIUse(platform::avx512f)) { + block = ZMM_FLOAT_BLOCK; + } + return 96 + 4 * attr.k * (attr.n / block + 1) * 8; + } + std::unique_ptr CreateJitCode( + const matmul_attr_t& attr) const override { + PADDLE_ENFORCE_GT(attr.m, 0); + PADDLE_ENFORCE_GT(attr.n, 0); + PADDLE_ENFORCE_GT(attr.k, 0); + return make_unique(attr, CodeSize(attr)); + } +}; + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle + +namespace gen = paddle::operators::jit::gen; + +REGISTER_JITKERNEL_GEN(kMatMul, gen::MatMulCreator); diff --git a/paddle/fluid/operators/jit/gen/matmul.h b/paddle/fluid/operators/jit/gen/matmul.h new file mode 100644 index 0000000000000000000000000000000000000000..626baa8f738bf0395f3c7f1700610d0a9075879b --- /dev/null +++ b/paddle/fluid/operators/jit/gen/matmul.h @@ -0,0 +1,62 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ + +#pragma once + +#include // for malloc and free +#include +#include +#include "glog/logging.h" +#include "paddle/fluid/operators/jit/gen/jitcode.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +class MatMulJitCode : public JitCode { + public: + explicit MatMulJitCode(const matmul_attr_t& attr, + size_t code_size = 256 * 1024, + void* code_ptr = nullptr) + : JitCode(code_size, code_ptr), m_(attr.m), n_(attr.n), k_(attr.k) { + PADDLE_ENFORCE_EQ(m_, 1, "Only support m==1 yet"); + this->genCode(); + } + + virtual const char* name() const { + std::string base = "MatMulJitCode"; + base = base + "_M" + std::to_string(m_) + "_N" + std::to_string(n_) + "_K" + + std::to_string(k_); + return base.c_str(); + } + void genCode() override; + + private: + int m_, n_, k_; + + reg64_t param_x{abi_param1}; + reg64_t param_y{abi_param2}; + reg64_t param_z{abi_param3}; + reg64_t param_attr{abi_param4}; + reg64_t reg_tmp{rax}; + + reg64_t reg_ptr_wgt{r10}; +}; + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jit/gen_base.cc b/paddle/fluid/operators/jit/gen_base.cc index 310da0c76f1ab251d788e54f2305f375f3fb4838..3cd5f6554bdc188ce9ea0c0b85c84d032c509600 100644 --- a/paddle/fluid/operators/jit/gen_base.cc +++ b/paddle/fluid/operators/jit/gen_base.cc @@ -16,6 +16,8 @@ #include #include #include +#include +#include "paddle/fluid/platform/cpu_info.h" DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file"); @@ -38,6 +40,35 @@ void GenBase::dumpCode(const unsigned char* code) const { } } +std::vector packed_groups(int n, int k, int* block_out, int* rest_out) { + int block; + int max_num_regs; + if (platform::MayIUse(platform::avx512f)) { + block = ZMM_FLOAT_BLOCK; + max_num_regs = 32; + } else { + block = YMM_FLOAT_BLOCK; + max_num_regs = 16; + } + // one for x, one for y, others for z + const int max_used_regs_for_n 
= max_num_regs - 2; + const int aligned_n = n % block == 0 ? n : (n / block + 1) * block; + const int num_block = aligned_n / block; + const int num_groups = num_block / max_used_regs_for_n; + std::vector groups(num_groups, max_used_regs_for_n); + int rest_num_regs = num_block % max_used_regs_for_n; + if (rest_num_regs != 0) { + groups.push_back(rest_num_regs); + } + if (block_out) { + *block_out = block; + } + if (rest_out) { + *rest_out = n % block; + } + return groups; +} + } // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jit/gen_base.h b/paddle/fluid/operators/jit/gen_base.h index 4af01a437670aa6a07d370ff23ed2abd369f69a3..d808a332472ae86240cb63356cb417123523366a 100644 --- a/paddle/fluid/operators/jit/gen_base.h +++ b/paddle/fluid/operators/jit/gen_base.h @@ -16,6 +16,7 @@ #include #include // for unique_ptr +#include #include "paddle/fluid/operators/jit/kernel_base.h" DECLARE_bool(dump_jitcode); @@ -67,6 +68,11 @@ class JitCodeCreator : public GenCreator { virtual std::unique_ptr CreateJitCode(const Attr& attr) const = 0; }; +// unify the method of packed groups +// output the packed groups which used in weights, the block size and rest size +std::vector packed_groups(int n, int k, int* block = nullptr, + int* rest = nullptr); + } // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc index 5dbe22a81b4866bdf60a03710d8ffd0b7bcb597b..e7292fe2bd8031aa5bbff68e7c2305a238085bf1 100644 --- a/paddle/fluid/operators/jit/helper.cc +++ b/paddle/fluid/operators/jit/helper.cc @@ -14,6 +14,8 @@ #include "paddle/fluid/operators/jit/helper.h" #include // tolower +#include +#include #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -49,6 +51,9 @@ const char* to_string(KernelType kt) { ONE_CASE(kNCHW16CMulNC); ONE_CASE(kSeqPool); ONE_CASE(kMatMul); + ONE_CASE(kHMax); + ONE_CASE(kHSum); + ONE_CASE(kSoftmax); 
default: PADDLE_THROW("Not support type: %d, or forget to add it.", kt); return "NOT JITKernel"; @@ -88,6 +93,41 @@ KernelType to_kerneltype(const std::string& act) { return kNone; } +template <> +void pack_weights(const float* src, float* dst, int n, int k) { + int block, rest; + const auto groups = packed_groups(n, k, &block, &rest); + std::for_each(groups.begin(), groups.end(), [&](int i) { + PADDLE_ENFORCE_GT(i, 0, "each element of groups should be larger than 0."); + }); + int sum = std::accumulate(groups.begin(), groups.end(), 0); + std::memset(dst, 0, k * sum * block * sizeof(float)); + PADDLE_ENFORCE_GE(sum * block, n, + "The packed n should be equal to or larger than n"); + + const int block_len = sizeof(float) * block; + int n_offset = 0; + + for (size_t g = 0; g < groups.size(); ++g) { + const float* from = src + n_offset; + for (int j = 0; j < k; ++j) { + size_t copy_sz = groups[g] * block_len; + if (g == groups.size() - 1 && rest != 0) { + copy_sz = (groups[g] - 1) * block_len + rest * sizeof(float); + } + std::memcpy(dst, from + j * n, copy_sz); + dst += groups[g] * block; + } + n_offset += groups[g] * block; + } +} + +template +typename std::enable_if::value>::type pack_weights( + const T* src, T* dst, int n, int k) { + PADDLE_THROW("Only support pack with float type."); +} + } // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index fbf34fc4b3db49596b6be0360c00e77c12fab9b8..d5773d65940127ea0a9b77ed2760bd371b778f4c 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -118,6 +118,35 @@ typename KernelTuples::func_type Get( return GetRefer(); } +template +class KernelFuncs { + public: + KernelFuncs() = default; + static KernelFuncs& Cache() { + static thread_local KernelFuncs g_func_cache; + return g_func_cache; + } + + bool Has(int key) const { return funcs_.find(key) != funcs_.end(); } + + void Insert(int key, 
typename KernelTuples::func_type func) { + funcs_.emplace(key, func); + } + + typename KernelTuples::func_type At(int key) { + if (Has(key)) { + return funcs_.at(key); + } + auto func = Get(key); + Insert(key, func); + return func; + } + + private: + std::unordered_map funcs_; + DISABLE_COPY_AND_ASSIGN(KernelFuncs); +}; + const char* to_string(KernelType kt); const char* to_string(SeqPoolType kt); @@ -130,17 +159,28 @@ inline std::ostream& operator<<(std::ostream& os, const lstm_attr_t& attr) { << (attr.use_peephole ? "True" : "False") << "]"; return os; } + inline std::ostream& operator<<(std::ostream& os, const gru_attr_t& attr) { os << "dim_size[" << attr.d << "],act_gate[" << to_string(attr.act_gate) << "],act_cand[" << to_string(attr.act_cand) << "]"; return os; } + inline std::ostream& operator<<(std::ostream& os, const seq_pool_attr_t& attr) { os << "height_size[" << attr.h << "],width_size[" << attr.w << "],pool_type[" << to_string(attr.type) << "]"; return os; } +inline std::ostream& operator<<(std::ostream& os, const matmul_attr_t& attr) { + os << "M[" << attr.m << "],N[" << attr.n << "],K[" << attr.k << "]"; + return os; +} + +// expose the method to pack matmul weight +template +void pack_weights(const T* src, T* dst, int n, int k); + } // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index adb101bd5cdf231ac330dbf44beb4c24c1fcf29e..4a8f61146a1921fa1d5f6b7e15af40cd45d31a22 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -20,6 +20,7 @@ namespace paddle { namespace operators { namespace jit { +// TODO(TJ): reorder by alphabet typedef enum { kNone = 0, kVMul = 1, @@ -44,6 +45,9 @@ typedef enum { kNCHW16CMulNC, kSeqPool, kMatMul, + kHSum, // horizontal max + kHMax, // horizontal sum + kSoftmax, } KernelType; typedef enum { @@ -70,6 +74,10 @@ struct XYNTuples { typedef void (*func_type)(const 
T*, T*, int); }; +// x, return and int +template +struct XRNTuples : public XYNTuples {}; + typedef struct { void* gates; // gates: x_ch, x_ih, x_fh, x_oh const void* ct_1; @@ -137,11 +145,19 @@ struct SeqPoolTuples { typedef void (*func_type)(const T*, T*, const seq_pool_attr_t*); }; +typedef struct matmul_attr_s { + int m, n, k; + void* packed_weight{nullptr}; + matmul_attr_s() = default; + explicit matmul_attr_s(int m_, int n_, int k_, void* packed_weight_ = nullptr) + : m(m_), n(n_), k(k_), packed_weight(packed_weight_) {} +} matmul_attr_t; + template struct MatMulTuples { typedef T data_type; - typedef int attr_type; - typedef void (*func_type)(const T*, const T*, T*, int, int, int); + typedef matmul_attr_t attr_type; + typedef void (*func_type)(const T*, const T*, T*, const matmul_attr_t*); }; template @@ -159,6 +175,13 @@ struct LayerNormTuples { const float, int); }; +template +struct SoftmaxTuples { + typedef T data_type; + typedef int attr_type; + typedef void (*func_type)(const T*, T*, int, int); +}; + // nChw16c = nChw16c .* NC template struct NCHW16CMulNCTuples { diff --git a/paddle/fluid/operators/jit/kernel_key.cc b/paddle/fluid/operators/jit/kernel_key.cc index 61de38688664f83775c0c4e5aa6f7e06c3602ddb..1e4a8884e78c5d3c1748988f05ecf461a6f0eb94 100644 --- a/paddle/fluid/operators/jit/kernel_key.cc +++ b/paddle/fluid/operators/jit/kernel_key.cc @@ -49,6 +49,13 @@ size_t JitCodeKey(const seq_pool_attr_t& attr) { return (key << pool_type_shift) + static_cast(attr.type); } +template <> +size_t JitCodeKey(const matmul_attr_t& attr) { + size_t key = attr.m; + constexpr int shift = 21; + return (key << shift * 2) + ((static_cast(attr.n)) << shift) + attr.k; +} + } // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jit/more/mix/CMakeLists.txt b/paddle/fluid/operators/jit/more/mix/CMakeLists.txt index e05f204b1eebd03c7a00157d96d0482f4a44a7fb..dd039d29152961210958470a48f086a133ab640c 100644 --- 
a/paddle/fluid/operators/jit/more/mix/CMakeLists.txt +++ b/paddle/fluid/operators/jit/more/mix/CMakeLists.txt @@ -12,3 +12,4 @@ USE_JITKERNEL_MORE(kLSTMC1H1, mix) USE_JITKERNEL_MORE(kGRUH1, mix) USE_JITKERNEL_MORE(kGRUHtPart1, mix) USE_JITKERNEL_MORE(kGRUHtPart2, mix) +USE_JITKERNEL_MORE(kSoftmax, mix) diff --git a/paddle/fluid/operators/jit/more/mix/mix.cc b/paddle/fluid/operators/jit/more/mix/mix.cc index df0a85256b1f546d5f64be73925cf58b87a25bd7..0036d1c238b17768c4df61af22a85588990e1815 100644 --- a/paddle/fluid/operators/jit/more/mix/mix.cc +++ b/paddle/fluid/operators/jit/more/mix/mix.cc @@ -48,6 +48,32 @@ void VTanh(const T* x, T* y, int n) { compute_addbias(&b, y, y, n); } +void Softmax(const T* x, T* y, int n, int bs) { + auto compute_hmax = + KernelFuncs, platform::CPUPlace>::Cache().At(n); + auto compute_hsum = + KernelFuncs, platform::CPUPlace>::Cache().At(n); + auto compute_vscal = + KernelFuncs, platform::CPUPlace>::Cache().At(n); + auto compute_vaddbias = + KernelFuncs, platform::CPUPlace>::Cache().At(n); + auto compute_vexp = + KernelFuncs, platform::CPUPlace>::Cache().At(n); + + for (int i = 0; i < bs; ++i) { + T scalar; + compute_hmax(x, &scalar, n); + scalar = static_cast(0) - scalar; + compute_vaddbias(&scalar, x, y, n); // x - max + compute_vexp(y, y, n); + compute_hsum(y, &scalar, n); + scalar = static_cast(1) / scalar; + compute_vscal(&scalar, y, y, n); + x += n; + y += n; + } +} + void (*getActFunc(KernelType type, int d))(const T*, T*, int) { // NOLINT if (type == kVSigmoid) { return Get, platform::CPUPlace>(d); @@ -184,6 +210,8 @@ bool VSigmoidKernel::UseMe(const int& d) const { return true; } bool VTanhKernel::UseMe(const int& d) const { return true; } +bool SoftmaxKernel::UseMe(const int& d) const { return true; } + bool LSTMCtHtKernel::UseMe(const lstm_attr_t& attr) const { return true; } bool LSTMC1H1Kernel::UseMe(const lstm_attr_t& attr) const { return true; } @@ -207,6 +235,7 @@ namespace mix = paddle::operators::jit::more::mix; 
REGISTER_MORE_KERNEL(kVSigmoid, VSigmoid); REGISTER_MORE_KERNEL(kVTanh, VTanh); +REGISTER_MORE_KERNEL(kSoftmax, Softmax); REGISTER_MORE_KERNEL(kLSTMCtHt, LSTMCtHt); REGISTER_MORE_KERNEL(kLSTMC1H1, LSTMC1H1); REGISTER_MORE_KERNEL(kGRUH1, GRUH1); diff --git a/paddle/fluid/operators/jit/more/mix/mix.h b/paddle/fluid/operators/jit/more/mix/mix.h index a70ecdf9348f511311307b4c27bb4506222a7439..d64af192197a0b339a39a1862c028875da2f3900 100644 --- a/paddle/fluid/operators/jit/more/mix/mix.h +++ b/paddle/fluid/operators/jit/more/mix/mix.h @@ -26,6 +26,7 @@ using T = float; void VSigmoid(const T* x, T* y, int n); void VTanh(const T* x, T* y, int n); +void Softmax(const T* x, T* y, int n, int bs); void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr); void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr); @@ -45,6 +46,9 @@ void GRUHtPart2(gru_t* step, const gru_attr_t* attr); DECLARE_MORE_KERNEL(VSigmoid, XYNTuples); DECLARE_MORE_KERNEL(VTanh, XYNTuples); +// XRN +DECLARE_MORE_KERNEL(Softmax, SoftmaxTuples); + DECLARE_MORE_KERNEL(LSTMCtHt, LSTMTuples); DECLARE_MORE_KERNEL(LSTMC1H1, LSTMTuples); diff --git a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt index 667c6dfad6676d00ab994564bff57c90caa0cb41..f9e5aea32e7cd48e9b39c4c3ee0e30f4a5c84f6f 100644 --- a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt +++ b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt @@ -12,3 +12,4 @@ USE_JITKERNEL_MORE(kVSquare, mkl) USE_JITKERNEL_MORE(kVSigmoid, mkl) USE_JITKERNEL_MORE(kVTanh, mkl) USE_JITKERNEL_MORE(kSeqPool, mkl) +USE_JITKERNEL_MORE(kSoftmax, mkl) diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc index fccdc68f5efa34bac6f5a34a41569d2f77416284..4c999131ab116ebe3484355158993558b02cc4b2 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.cc +++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc @@ -25,17 +25,19 @@ namespace more { namespace mkl { template <> -void MatMul(const float* 
a, const float* b, float* c, int m, int n, - int k) { - platform::dynload::cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, - n, k, 1.f, a, k, b, n, 0.f, c, n); +void MatMul(const float* a, const float* b, float* c, + const matmul_attr_t* attr) { + platform::dynload::cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + attr->m, attr->n, attr->k, 1.f, a, attr->k, b, + attr->n, 0.f, c, attr->n); } template <> -void MatMul(const double* a, const double* b, double* c, int m, int n, - int k) { - platform::dynload::cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, - n, k, 1.0, a, k, b, n, 0.0, c, n); +void MatMul(const double* a, const double* b, double* c, + const matmul_attr_t* attr) { + platform::dynload::cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + attr->m, attr->n, attr->k, 1.0, a, attr->k, b, + attr->n, 0.0, c, attr->n); } template <> @@ -116,12 +118,17 @@ void VAXPY(double a, const double* x, double* y, int n) { platform::dynload::cblas_daxpy(n, a, x, 1, y, 1); } -// TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512 template <> -bool MatMulKernel::UseMe(const int& d) const { - return platform::MayIUse(platform::avx); +void ASum(const float* x, float* res, int n) { + res[0] = platform::dynload::cblas_sasum(n, x, 1); } +template <> +void ASum(const double* x, double* res, int n) { + res[0] = platform::dynload::cblas_dasum(n, x, 1); +} + +// TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512 template <> bool VMulKernel::UseMe(const int& d) const { return platform::MayIUse(platform::avx512f) && d > 512; @@ -129,7 +136,7 @@ bool VMulKernel::UseMe(const int& d) const { template <> bool VAddKernel::UseMe(const int& d) const { - return platform::MayIUse(platform::avx512f) && d > 512; + return platform::MayIUse(platform::avx) && d > 512; } template <> @@ -167,13 +174,28 @@ bool SeqPoolKernel::UseMe(const seq_pool_attr_t& attr) const { return true; } +template <> +bool MatMulKernel::UseMe(const matmul_attr_t& attr) const { + return 
platform::MayIUse(platform::avx); +} + +template <> +bool MatMulKernel::UseMe(const matmul_attr_t& attr) const { + return true; +} + +template <> +bool SoftmaxKernel::UseMe(const int& d) const { + // tuned on avx2 + return platform::MayIUse(platform::avx) && d < 60; +} + #define AWALYS_USE_ME_WITH_DOUBLE(func) \ template <> \ bool func##Kernel::UseMe(const int& d) const { \ return true; \ } -AWALYS_USE_ME_WITH_DOUBLE(MatMul); AWALYS_USE_ME_WITH_DOUBLE(VMul); AWALYS_USE_ME_WITH_DOUBLE(VAdd); AWALYS_USE_ME_WITH_DOUBLE(VScal); @@ -181,6 +203,7 @@ AWALYS_USE_ME_WITH_DOUBLE(VExp); AWALYS_USE_ME_WITH_DOUBLE(VSigmoid); AWALYS_USE_ME_WITH_DOUBLE(VTanh); AWALYS_USE_ME_WITH_DOUBLE(VSquare); +AWALYS_USE_ME_WITH_DOUBLE(Softmax); #undef AWALYS_USE_ME_WITH_DOUBLE } // namespace mkl @@ -204,5 +227,6 @@ REGISTER_MKL_KERNEL(kVSquare, VSquare); REGISTER_MKL_KERNEL(kVSigmoid, VSigmoid); REGISTER_MKL_KERNEL(kVTanh, VTanh); REGISTER_MKL_KERNEL(kSeqPool, SeqPool); +REGISTER_MKL_KERNEL(kSoftmax, Softmax); #undef REGISTER_MKL_KERNEL diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h index a27196fa19f1d3e9aa6c414b6b9f99a21ef49025..8130b87326f1887f232022ab30fa7bf42b0723e7 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.h +++ b/paddle/fluid/operators/jit/more/mkl/mkl.h @@ -16,6 +16,7 @@ #include #include +#include #include "paddle/fluid/operators/jit/kernel_base.h" namespace paddle { @@ -25,7 +26,7 @@ namespace more { namespace mkl { template -void MatMul(const T* a, const T* b, T* c, int m, int n, int k); +void MatMul(const T* a, const T* b, T* c, const matmul_attr_t* attr); template void VMul(const T* x, const T* y, T* z, int n); @@ -90,6 +91,30 @@ void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) { } } +template +void ASum(const T* x, T* res, int n); + +template +void Softmax(const T* x, T* y, int n, int bs) { + std::vector entities(bs); + for (int i = 0; i < bs; ++i) { + entities[i] = x[i * n]; + for (int c = 1; c < n; ++c) { 
+ entities[i] = x[i * n + c] > entities[i] ? x[i * n + c] : entities[i]; + } + for (int c = 0; c < n; ++c) { + y[i * n + c] = x[i * n + c] - entities[i]; + } + } + VExp(y, y, n * bs); + for (int i = 0; i < bs; ++i) { + T sum; + ASum(&y[i * n], &sum, n); + sum = static_cast(1) / sum; + VScal(&sum, &y[i * n], &y[i * n], n); + } +} + #define DECLARE_MKL_KERNEL(name, tuples) \ template \ class name##Kernel : public KernelMore> { \ @@ -117,6 +142,8 @@ DECLARE_MKL_KERNEL(VSquare, XYNTuples); DECLARE_MKL_KERNEL(SeqPool, SeqPoolTuples); +DECLARE_MKL_KERNEL(Softmax, SoftmaxTuples); + #undef DECLARE_MKL_KERNEL } // namespace mkl diff --git a/paddle/fluid/operators/jit/refer/CMakeLists.txt b/paddle/fluid/operators/jit/refer/CMakeLists.txt index 4b9bc5e8d49c62404d5d4ef99b7c50987fcb415a..9f2935828ca300dbdb71b0fefb6b9883cb45e4b0 100644 --- a/paddle/fluid/operators/jit/refer/CMakeLists.txt +++ b/paddle/fluid/operators/jit/refer/CMakeLists.txt @@ -29,3 +29,6 @@ USE_JITKERNEL_REFER(kNCHW16CMulNC) USE_JITKERNEL_REFER(kSeqPool) USE_JITKERNEL_REFER(kMatMul) USE_JITKERNEL_REFER(kVSquare) +USE_JITKERNEL_REFER(kHSum) +USE_JITKERNEL_REFER(kHMax) +USE_JITKERNEL_REFER(kSoftmax) diff --git a/paddle/fluid/operators/jit/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc index 3512ad7fe7921381afb6152330fff6be34de5ad7..b8adb40ec7e1b64df2b04a3201292db235af7b19 100644 --- a/paddle/fluid/operators/jit/refer/refer.cc +++ b/paddle/fluid/operators/jit/refer/refer.cc @@ -52,4 +52,9 @@ REGISTER_REFER_KERNEL(kSeqPool, SeqPool); REGISTER_REFER_KERNEL(kMatMul, MatMul); +REGISTER_REFER_KERNEL(kHMax, HMax); +REGISTER_REFER_KERNEL(kHSum, HSum); + +REGISTER_REFER_KERNEL(kSoftmax, Softmax); + #undef REGISTER_REFER_KERNEL diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index 97d029358594d757f0e1874e9c87ecb8f97c9d50..0c4a985f8e8ece0a6169478fa3a9b111f5a6f3b4 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h 
@@ -363,21 +363,57 @@ void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) { // A(M,K) * B(K,N) = C(M,N) template -void MatMul(const T* A, const T* B, T* C, int M, int N, int K) { +void MatMul(const T* A, const T* B, T* C, const matmul_attr_t* attr) { + int M = attr->m; + int N = attr->n; + int K = attr->k; for (int m = 0; m < M; ++m) { const T* pa = A + m * K; T* pc = C + m * N; for (int n = 0; n < N; ++n) { const T* pb = B + n; - T sum = static_cast(0); - for (int k = 0; k < K; ++k) { - sum += (pa[k] * pb[k * N]); + pc[n] = pa[0] * pb[0]; + for (int k = 1; k < K; ++k) { + pc[n] += pa[k] * pb[k * N]; } - *(pc + n) = sum; } } } +template +void HMax(const T* x, T* res, int n) { + res[0] = x[0]; + for (int i = 1; i < n; ++i) { + res[0] = res[0] < x[i] ? x[i] : res[0]; + } +} + +template +void HSum(const T* x, T* res, int n) { + res[0] = x[0]; + for (int i = 1; i < n; ++i) { + res[0] += x[i]; + } +} + +// y = e^(x - max(x)) +// y = y / sum(y) +template +void Softmax(const T* x, T* y, int n, int bs = 1) { + for (int i = 0; i < bs; ++i) { + T scalar; + HMax(x, &scalar, n); + scalar = static_cast(0) - scalar; + VAddBias(&scalar, x, y, n); // x - max + VExp(y, y, n); + HSum(y, &scalar, n); + scalar = static_cast(1) / scalar; + VScal(&scalar, y, y, n); + x += n; + y += n; + } +} + #define DECLARE_REFER_KERNEL(name, tuples) \ template \ class name##Kernel : public ReferKernel> { \ @@ -421,6 +457,11 @@ DECLARE_REFER_KERNEL(SeqPool, SeqPoolTuples); DECLARE_REFER_KERNEL(MatMul, MatMulTuples); +DECLARE_REFER_KERNEL(HMax, XRNTuples); +DECLARE_REFER_KERNEL(HSum, XRNTuples); + +DECLARE_REFER_KERNEL(Softmax, SoftmaxTuples); + #undef DECLARE_REFER_KERNEL } // namespace refer diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 68a79b6314e4cf86f5b715b9c6694924126b12da..237e588d35cc3b33658a830db34676967818aab6 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -22,7 +22,7 @@ #include 
"paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/place.h" -static double acc = 1e-5; +DEFINE_double(acc, 1e-5, "Test accuracy threshold."); template void RandomVec(const int n, T* a, const T lower = static_cast(-20.f), @@ -39,7 +39,7 @@ template void ExpectEQ(const T* target, const T* refer, int n) { if (std::is_floating_point::value) { for (int i = 0; i < n; ++i) { - EXPECT_NEAR(target[i], refer[i], acc); + EXPECT_NEAR(target[i], refer[i], FLAGS_acc); } } else { for (int i = 0; i < n; ++i) { @@ -61,6 +61,7 @@ std::vector TestSizes() { } namespace jit = paddle::operators::jit; +using CPUPlace = paddle::platform::CPUPlace; template struct TestFuncWithRefer { @@ -121,6 +122,40 @@ struct TestFuncWithRefer, T, std::vector, } }; +template +struct TestFuncWithRefer, std::vector, std::vector, + int, int> { + void operator()(const typename jit::SoftmaxTuples::func_type tgt, + const std::vector& x, const std::vector& yref, int n, + int bs) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(yref.size(), x.size()); + EXPECT_EQ(x.size(), static_cast(n * bs)); + const T* x_data = x.data(); + const T* yref_data = yref.data(); + std::vector ytgt(n * bs); + T* ytgt_data = ytgt.data(); + // test normal + tgt(x_data, ytgt_data, n, bs); + ExpectEQ(ytgt_data, yref_data, n * bs); + // test inplace x + std::copy(x.begin(), x.end(), ytgt.begin()); + tgt(ytgt_data, ytgt_data, n, bs); + ExpectEQ(ytgt_data, yref_data, n * bs); + } +}; + +template +struct TestFuncWithRefer, std::vector, T> { + void operator()(const typename jit::XRNTuples::func_type tgt, + const std::vector& x, const T ref_res) { + EXPECT_TRUE(tgt != nullptr); + T tgt_res; + tgt(x.data(), &tgt_res, x.size()); + ExpectEQ(&tgt_res, &ref_res, 1); + } +}; + template struct TestFuncWithRefer, std::vector, std::vector> { void operator()(const typename jit::XYNTuples::func_type tgt, @@ -172,7 +207,7 @@ struct TestFuncWithRefer, std::vector, std::vector, T* ht_data = ht.data(); T* checked_data = checked.data(); - 
paddle::operators::jit::lstm_t step; + jit::lstm_t step; step.gates = x_data; step.ct_1 = ct_1_data; step.ct = ct_data; @@ -208,7 +243,7 @@ struct TestFuncWithRefer, std::vector, std::vector, const T* ht_ref_data = ht_ref.data(); T* x_data = x.data(); T* ht_data = ht.data(); - paddle::operators::jit::gru_t step; + jit::gru_t step; step.gates = x_data; step.ht_1 = ht_1_data; step.ht = ht_data; @@ -237,26 +272,28 @@ struct TestFuncWithRefer, std::vector, std::vector, template struct TestFuncWithRefer, std::vector, std::vector, - std::vector, int, int, int> { + std::vector, + typename jit::MatMulTuples::attr_type> { void operator()(const typename jit::MatMulTuples::func_type tgt, const std::vector& a, const std::vector& b, - const std::vector& cref, int m, int n, int k) { + const std::vector& cref, + const typename jit::MatMulTuples::attr_type& attr) { EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(a.size(), static_cast(m * k)); - EXPECT_EQ(b.size(), static_cast(k * n)); - EXPECT_EQ(cref.size(), static_cast(m * n)); + EXPECT_EQ(a.size(), static_cast(attr.m * attr.k)); + EXPECT_EQ(b.size(), static_cast(attr.k * attr.n)); + EXPECT_EQ(cref.size(), static_cast(attr.m * attr.n)); std::vector c(cref.size()); const T* a_data = a.data(); const T* b_data = b.data(); const T* cref_data = cref.data(); T* c_data = c.data(); - tgt(a_data, b_data, c_data, m, n, k); - ExpectEQ(c_data, cref_data, m * n); + tgt(a_data, b_data, c_data, &attr); + ExpectEQ(c_data, cref_data, attr.m * attr.n); } }; -template +template void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { TestFuncWithRefer test; // test jitcode @@ -286,9 +323,8 @@ void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... 
args) { test(tgt, args...); } -template +template void TestXYZNKernel() { - namespace jit = paddle::operators::jit; VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); for (int d : TestSizes()) { auto ref = jit::GetRefer>(); @@ -320,9 +356,8 @@ void TestXYZNKernel() { } } -template +template void TestAXYNKernel() { - namespace jit = paddle::operators::jit; VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); for (int d : TestSizes()) { auto ref = jit::GetRefer>(); @@ -347,9 +382,26 @@ void TestAXYNKernel() { } } -template +template +void TestXRNKernel() { + VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); + auto last_acc = FLAGS_acc; + FLAGS_acc = 1e-4; + for (int d : TestSizes()) { + auto ref = jit::GetRefer>(); + EXPECT_TRUE(ref != nullptr); + std::vector x(d); + RandomVec(d, x.data(), -2.f, 2.f); + T ref_res; + ref(x.data(), &ref_res, d); + TestAllImpls, PlaceType, std::vector, T>(d, x, + ref_res); + } + FLAGS_acc = last_acc; +} + +template void TestXYNKernel() { - namespace jit = paddle::operators::jit; VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); for (int d : TestSizes()) { auto ref = jit::GetRefer>(); @@ -373,9 +425,8 @@ void TestXYNKernel() { } } -template +template void TestLSTMKernel() { - namespace jit = paddle::operators::jit; VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); std::vector all_acts = {"sigmoid", "tanh", "relu", "identity"}; for (int d : TestSizes()) { @@ -424,9 +475,8 @@ void TestLSTMKernel() { } } -template +template void TestGRUKernel() { - namespace jit = paddle::operators::jit; VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); std::vector all_acts = {"sigmoid", "tanh", "relu", "identity"}; for (int d : TestSizes()) { @@ -459,7 +509,7 @@ void TestGRUKernel() { } } -template +template void TestSeqPoolKernel() { VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); std::vector pool_types = { @@ -484,12 +534,13 @@ void TestSeqPoolKernel() { } } -template +template void 
TestMatMulKernel() { VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); - auto last_acc = acc; - // TODO(intel): this should be acc issue of MKL - acc = 1e-3; + auto last_acc = FLAGS_acc; + // TODO(intel): fix MKL acc issue + // https://github.com/PaddlePaddle/Paddle/issues/15447 + FLAGS_acc = 1e-3; for (int m : {1, 2, 3, 4}) { for (int n : {1, 2, 3, 4}) { for (int k : TestSizes()) { @@ -501,16 +552,42 @@ void TestMatMulKernel() { const T* a_data = a.data(); const T* b_data = b.data(); T* c_data = c.data(); - ref(a_data, b_data, c_data, m, n, k); + const jit::matmul_attr_t attr{m, n, k}; + ref(a_data, b_data, c_data, &attr); TestAllImpls, PlaceType, std::vector, - std::vector, std::vector>(k, a, b, c, m, n, k); + std::vector, std::vector>(attr, a, b, c, attr); } } } - acc = last_acc; + FLAGS_acc = last_acc; +} + +template +void TestSoftmaxKernel() { + VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); + for (int bs : {1, 2, 10}) { + for (int n : TestSizes()) { + auto ref = jit::GetRefer>(); + EXPECT_TRUE(ref != nullptr); + std::vector x(bs * n), y(bs * n); + RandomVec(bs * n, x.data(), -2.f, 2.f); + const T* x_data = x.data(); + T* y_data = y.data(); + + std::vector xinp(x.size()); // inplace test + std::copy(x.begin(), x.end(), xinp.begin()); + ref(x_data, y_data, n, bs); + T* xinp_data = xinp.data(); + ref(xinp_data, xinp_data, n, bs); + ExpectEQ(xinp_data, y_data, n * bs); + + TestAllImpls, PlaceType, std::vector, + std::vector>(n, x, y, n, bs); + } + } } -template +template void TestNCHW16CMulNCKernel() { VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); const int n = 3, c = 16 * 4, h = 10, w = 10; @@ -565,129 +642,123 @@ void TestNCHW16CMulNCKernel() { // XYZNTuple TEST(JITKernel, kVMul) { - namespace jit = paddle::operators::jit; - TestXYZNKernel(); - TestXYZNKernel(); + TestXYZNKernel(); + TestXYZNKernel(); } TEST(JITKernel, kVAdd) { - namespace jit = paddle::operators::jit; - TestXYZNKernel(); - TestXYZNKernel(); + TestXYZNKernel(); 
+ TestXYZNKernel(); } TEST(JITKernel, kVAddRelu) { - namespace jit = paddle::operators::jit; - TestXYZNKernel(); - TestXYZNKernel(); + TestXYZNKernel(); + TestXYZNKernel(); } TEST(JITKernel, kVSub) { - namespace jit = paddle::operators::jit; - TestXYZNKernel(); - TestXYZNKernel(); + TestXYZNKernel(); + TestXYZNKernel(); } // AXYNTuples TEST(JITKernel, kVScal) { - namespace jit = paddle::operators::jit; - TestAXYNKernel(); - TestAXYNKernel(); + TestAXYNKernel(); + TestAXYNKernel(); } TEST(JITKernel, kVAddBias) { - namespace jit = paddle::operators::jit; - TestAXYNKernel(); - TestAXYNKernel(); + TestAXYNKernel(); + TestAXYNKernel(); +} + +// XRNTuples +TEST(JITKernel, kHMax) { + TestXRNKernel(); + TestXRNKernel(); +} + +TEST(JITKernel, kHSum) { + TestXRNKernel(); + TestXRNKernel(); } // XYNTuples TEST(JITKernel, kVRelu) { - namespace jit = paddle::operators::jit; - TestXYNKernel(); - TestXYNKernel(); + TestXYNKernel(); + TestXYNKernel(); } TEST(JITKernel, kVIdentity) { - namespace jit = paddle::operators::jit; - TestXYNKernel(); - TestXYNKernel(); + TestXYNKernel(); + TestXYNKernel(); } TEST(JITKernel, kVSquare) { - namespace jit = paddle::operators::jit; - TestXYNKernel(); - TestXYNKernel(); + TestXYNKernel(); + TestXYNKernel(); } TEST(JITKernel, kVExp) { - namespace jit = paddle::operators::jit; - TestXYNKernel(); - TestXYNKernel(); + TestXYNKernel(); + TestXYNKernel(); } TEST(JITKernel, kVSigmoid) { - namespace jit = paddle::operators::jit; - TestXYNKernel(); - TestXYNKernel(); + TestXYNKernel(); + TestXYNKernel(); } TEST(JITKernel, kVTanh) { - namespace jit = paddle::operators::jit; - TestXYNKernel(); - TestXYNKernel(); + TestXYNKernel(); + TestXYNKernel(); } // LSTM TEST(JITKernel, kLSTMCtHt) { - namespace jit = paddle::operators::jit; - TestLSTMKernel(); - TestLSTMKernel(); + TestLSTMKernel(); + TestLSTMKernel(); } TEST(JITKernel, kLSTMC1H1) { - namespace jit = paddle::operators::jit; - TestLSTMKernel(); - TestLSTMKernel(); + TestLSTMKernel(); + 
TestLSTMKernel(); } // GRU TEST(JITKernel, kGRUH1) { - namespace jit = paddle::operators::jit; - TestGRUKernel(); - TestGRUKernel(); + TestGRUKernel(); + TestGRUKernel(); } TEST(JITKernel, kGRUHtPart1) { - namespace jit = paddle::operators::jit; - TestGRUKernel(); - TestGRUKernel(); + TestGRUKernel(); + TestGRUKernel(); } TEST(JITKernel, kGRUHtPart2) { - namespace jit = paddle::operators::jit; - TestGRUKernel(); - TestGRUKernel(); + TestGRUKernel(); + TestGRUKernel(); } TEST(JITKernel, kSeqPool) { - namespace jit = paddle::operators::jit; - TestSeqPoolKernel(); - TestSeqPoolKernel(); + TestSeqPoolKernel(); + TestSeqPoolKernel(); } TEST(JITKernel, kMatMul) { - namespace jit = paddle::operators::jit; - TestMatMulKernel(); - TestMatMulKernel(); + TestMatMulKernel(); + TestMatMulKernel(); +} + +TEST(JITKernel, kSoftmax) { + TestSoftmaxKernel(); + TestSoftmaxKernel(); } TEST(JITKernel, kNCHW16CMulNC) { - namespace jit = paddle::operators::jit; - TestNCHW16CMulNCKernel(); - TestNCHW16CMulNCKernel(); + TestNCHW16CMulNCKernel(); + TestNCHW16CMulNCKernel(); } // TODO(yihua/TJ): add crf decoding and layer norm unit tests diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index fd15539f7b6727496988c9b13d0d2551659a420a..0af8b9e69cfe09890f28ef2028baa19319a5c379 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/lookup_table_op.h" #include "paddle/fluid/platform/assert.h" #include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/float16.h" namespace paddle { namespace operators { @@ -193,8 +194,11 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; +namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL(lookup_table, ops::LookupTableCUDAKernel, - ops::LookupTableCUDAKernel); + ops::LookupTableCUDAKernel, + ops::LookupTableCUDAKernel); REGISTER_OP_CUDA_KERNEL(lookup_table_grad, ops::LookupTableGradCUDAKernel, - ops::LookupTableGradCUDAKernel); + ops::LookupTableGradCUDAKernel, + ops::LookupTableGradCUDAKernel); diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 6bbb7155dda9b2c844f793a63adb861c2ed956e8..4b6eef18d8b967af5f3a5df0dee750620e7e412a 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -37,7 +37,7 @@ math_library(concat_and_split) math_library(context_project DEPS im2col math_function) math_library(cross_entropy) math_library(cos_sim_functor) -math_library(depthwise_conv) +math_library(depthwise_conv DEPS cub) math_library(im2col) math_library(sampler) @@ -53,7 +53,7 @@ math_library(sequence2batch) math_library(sequence_padding) math_library(sequence_pooling DEPS math_function jit_kernel_helper) math_library(sequence_scale) -math_library(softmax DEPS math_function) +math_library(softmax DEPS math_function jit_kernel_helper) math_library(beam_search DEPS math_function) math_library(matrix_bit_code) diff --git a/paddle/fluid/operators/math/beam_search.cc b/paddle/fluid/operators/math/beam_search.cc index fb7119273a734feba870fdabade6a4faa1d5e9a3..69971ef7423eff6bc3f8543a491edb6b0bbd00ca 100644 --- a/paddle/fluid/operators/math/beam_search.cc +++ b/paddle/fluid/operators/math/beam_search.cc @@ -29,8 +29,9 @@ class 
BeamSearchFunctor { const framework::LoDTensor *ids, const framework::LoDTensor *scores, framework::LoDTensor *selected_ids, - framework::LoDTensor *selected_scores, size_t level, - size_t beam_size, int end_id, bool is_accumulated) { + framework::LoDTensor *selected_scores, + framework::Tensor *parent_idx, size_t level, size_t beam_size, + int end_id, bool is_accumulated) { auto abs_lod = framework::ToAbsOffset(scores->lod()); auto &high_level = abs_lod[level]; @@ -57,11 +58,13 @@ class BeamSearchFunctor { std::vector({static_cast(num_instances), 1})); selected_ids->Resize(dims); selected_scores->Resize(dims); + parent_idx->Resize({static_cast(num_instances)}); auto *selected_ids_data = selected_ids->mutable_data(platform::CPUPlace()); auto *selected_scores_data = selected_scores->mutable_data(platform::CPUPlace()); + auto *parent_idx_data = parent_idx->mutable_data(platform::CPUPlace()); // fill in data std::vector low_level; @@ -69,6 +72,7 @@ class BeamSearchFunctor { for (auto &items : selected_items) { low_level.push_back(low_offset); for (auto &item : items) { + parent_idx_data[low_offset] = static_cast(low_level.size() - 1); selected_ids_data[low_offset] = item.id; selected_scores_data[low_offset] = item.score; low_offset++; diff --git a/paddle/fluid/operators/math/beam_search.cu b/paddle/fluid/operators/math/beam_search.cu index d94e3023ce537cb9fa456e079c4fa3cf57fb954d..61d021ef627f1ccd90b992c2078a7f3ca879422d 100644 --- a/paddle/fluid/operators/math/beam_search.cu +++ b/paddle/fluid/operators/math/beam_search.cu @@ -157,10 +157,10 @@ __device__ __forceinline__ bool PruneEndBeams(Triple* top_beam_local, } __device__ __forceinline__ void WriteBack( - int64_t* selected_ids, float* selected_scores, size_t* selected_offsets, - Triple* top_beam_local, const int seq_offset_start, - const int seq_offset_end, const int selected_seq_start, - const int selected_seq_length) { + int64_t* selected_ids, float* selected_scores, int* parent_idx, + size_t* selected_offsets, 
Triple* top_beam_local, + const int seq_offset_start, const int seq_offset_end, + const int selected_seq_start, const int selected_seq_length) { const int tid = threadIdx.x; // use 1 thread only for each sequence int global_index = selected_seq_start; for (int global_offset = seq_offset_start; global_offset < seq_offset_end; @@ -171,6 +171,7 @@ __device__ __forceinline__ void WriteBack( selected_ids[global_index] = static_cast(top_beam_local[local_index].id); selected_scores[global_index] = top_beam_local[local_index].score; + parent_idx[global_index] = static_cast(global_offset); global_index++; } } @@ -180,11 +181,11 @@ __device__ __forceinline__ void WriteBack( template __device__ void BeamSearchDetails( - int64_t* selected_ids, float* selected_scores, size_t* selected_offsets, - const int64_t* pre_ids, const float* pre_scores, const int64_t* ids, - const float* scores, const int seq_offset_start, const int seq_offset_end, - const int seq_width, int beam_size, int end_id, bool is_accumulated, - int num_used_threads) { + int64_t* selected_ids, float* selected_scores, int* parent_idx, + size_t* selected_offsets, const int64_t* pre_ids, const float* pre_scores, + const int64_t* ids, const float* scores, const int seq_offset_start, + const int seq_offset_end, const int seq_width, int beam_size, int end_id, + bool is_accumulated, int num_used_threads) { __shared__ Triple top_beam[MaxLength]; int num_items = 0; @@ -228,15 +229,15 @@ __device__ void BeamSearchDetails( selected_offsets[0] = 0; } - WriteBack(selected_ids, selected_scores, selected_offsets, top_beam_local, - seq_offset_start, seq_offset_end, selected_seq_start, - selected_seq_length); + WriteBack(selected_ids, selected_scores, parent_idx, selected_offsets, + top_beam_local, seq_offset_start, seq_offset_end, + selected_seq_start, selected_seq_length); } } template __global__ void BeamSearchKernel(int64_t* selected_ids, float* selected_scores, - size_t* selected_offsets, + int* parent_idx, size_t* 
selected_offsets, const int64_t* pre_ids, const float* pre_scores, const int64_t* ids, const float* scores, const size_t* seq_offsets, @@ -250,24 +251,25 @@ __global__ void BeamSearchKernel(int64_t* selected_ids, float* selected_scores, int seq_offset_end = static_cast(seq_offsets[seq_id + 1]); BeamSearchDetails( - selected_ids, selected_scores, selected_offsets, pre_ids, pre_scores, ids, - scores, seq_offset_start, seq_offset_end, seq_width, beam_size, end_id, - is_accumulated, num_used_threads); + selected_ids, selected_scores, parent_idx, selected_offsets, pre_ids, + pre_scores, ids, scores, seq_offset_start, seq_offset_end, seq_width, + beam_size, end_id, is_accumulated, num_used_threads); } template __global__ void BeamSearchKernelSingle( - int64_t* selected_ids, float* selected_scores, size_t* selected_offsets, - const int64_t* pre_ids, const float* pre_scores, const int64_t* ids, - const float* scores, const int seq_length, const int seq_width, - int beam_size, int end_id, bool is_accumulated, int num_used_threads) { + int64_t* selected_ids, float* selected_scores, int* parent_idx, + size_t* selected_offsets, const int64_t* pre_ids, const float* pre_scores, + const int64_t* ids, const float* scores, const int seq_length, + const int seq_width, int beam_size, int end_id, bool is_accumulated, + int num_used_threads) { const int seq_offset_start = 0; const int seq_offset_end = seq_length; BeamSearchDetails( - selected_ids, selected_scores, selected_offsets, pre_ids, pre_scores, ids, - scores, seq_offset_start, seq_offset_end, seq_width, beam_size, end_id, - is_accumulated, num_used_threads); + selected_ids, selected_scores, parent_idx, selected_offsets, pre_ids, + pre_scores, ids, scores, seq_offset_start, seq_offset_end, seq_width, + beam_size, end_id, is_accumulated, num_used_threads); } static inline int GetNumUsedThreads(const int max_threads_per_seq, @@ -300,8 +302,9 @@ class BeamSearchFunctor { const framework::LoDTensor* ids, const framework::LoDTensor* 
scores, framework::LoDTensor* selected_ids, - framework::LoDTensor* selected_scores, size_t level, - size_t beam_size, int end_id, bool is_accumulated) { + framework::LoDTensor* selected_scores, + framework::Tensor* parent_idx, size_t level, size_t beam_size, + int end_id, bool is_accumulated) { auto abs_lod = framework::ToAbsOffset(scores->lod()); const int64_t* pre_ids_data = pre_ids->data(); @@ -322,6 +325,8 @@ class BeamSearchFunctor { selected_ids->mutable_data(selected_dims, context.GetPlace()); float* selected_scores_data = selected_scores->mutable_data(selected_dims, context.GetPlace()); + int* parent_idx_data = parent_idx->mutable_data( + {static_cast(num_seqs * beam_size)}, context.GetPlace()); framework::LoD selected_lod(2); selected_lod[0].assign(abs_lod[level].begin(), abs_lod[level].end()); @@ -339,9 +344,9 @@ class BeamSearchFunctor { CUDA_LAUNCH_KERNEL_HELPER( BeamSearchKernelSingle<<< 1, kMaxThreadsPerSeq, 0, context.stream()>>>( - selected_ids_data, selected_scores_data, selected_offsets, - pre_ids_data, pre_scores_data, ids_data, scores_data, - seq_length, static_cast(seq_width), + selected_ids_data, selected_scores_data, parent_idx_data, + selected_offsets, pre_ids_data, pre_scores_data, ids_data, + scores_data, seq_length, static_cast(seq_width), static_cast(beam_size), static_cast(end_id), is_accumulated, num_used_threads)); } @@ -357,9 +362,9 @@ class BeamSearchFunctor { CUDA_LAUNCH_KERNEL_HELPER( BeamSearchKernel<<< 1, num_seqs * kMaxThreadsPerSeq, 0, context.stream()>>>( - selected_ids_data, selected_scores_data, selected_offsets, - pre_ids_data, pre_scores_data, ids_data, scores_data, - seq_offsets, static_cast(num_seqs), + selected_ids_data, selected_scores_data, parent_idx_data, + selected_offsets, pre_ids_data, pre_scores_data, ids_data, + scores_data, seq_offsets, static_cast(num_seqs), static_cast(seq_width), static_cast(beam_size), end_id, is_accumulated, num_used_threads)); } @@ -379,6 +384,7 @@ class BeamSearchFunctor { 
{static_cast(selected_lod[1].back()), 1}); selected_ids->Resize(final_selected_dims); selected_scores->Resize(final_selected_dims); + parent_idx->Resize({static_cast(selected_lod[1].back())}); } } }; diff --git a/paddle/fluid/operators/math/beam_search.h b/paddle/fluid/operators/math/beam_search.h index 3cd17f426c5596582c91f2b3f0cc5ba513e3aa4b..4474e7ea52affed792572d02202ec2577c471e50 100644 --- a/paddle/fluid/operators/math/beam_search.h +++ b/paddle/fluid/operators/math/beam_search.h @@ -104,14 +104,12 @@ class BeamSearchFunctor { * Return false if all the input tensor is empty, in machine translation task * that means no candidates is provided, and the task will stop running. */ - void operator()(const DeviceContext& context, - const framework::LoDTensor* pre_ids, - const framework::LoDTensor* pre_scores, - const framework::LoDTensor* ids, - const framework::LoDTensor* scores, - framework::LoDTensor* selected_ids, - framework::LoDTensor* selected_scores, size_t level, - size_t beam_size, int end_id, bool is_accumulated); + void operator()( + const DeviceContext& context, const framework::LoDTensor* pre_ids, + const framework::LoDTensor* pre_scores, const framework::LoDTensor* ids, + const framework::LoDTensor* scores, framework::LoDTensor* selected_ids, + framework::LoDTensor* selected_scores, framework::Tensor* parent_idx, + size_t level, size_t beam_size, int end_id, bool is_accumulated); }; } // namespace math diff --git a/paddle/fluid/operators/math/beam_search_test.cc b/paddle/fluid/operators/math/beam_search_test.cc index 1c29ee95f6b109209316e4e8c8f3cda37eac62ae..7ea8eb8b00db328ca13d3d33d751aca4eac66dae 100644 --- a/paddle/fluid/operators/math/beam_search_test.cc +++ b/paddle/fluid/operators/math/beam_search_test.cc @@ -93,13 +93,14 @@ void TestBeamSearch() { paddle::framework::LoDTensor selected_ids; paddle::framework::LoDTensor selected_scores; + paddle::framework::LoDTensor parent_idx; size_t level = 0; size_t beam_size = 2; int end_id = 0; 
paddle::operators::math::BeamSearchFunctor beamsearch; beamsearch(*context, &pre_ids, &pre_scores, &ids, &scores, &selected_ids, - &selected_scores, level, beam_size, end_id, true); + &selected_scores, &parent_idx, level, beam_size, end_id, true); ASSERT_EQ(selected_ids.lod(), selected_scores.lod()); diff --git a/paddle/fluid/operators/math/fc_compute.h b/paddle/fluid/operators/math/fc_compute.h index cddd0a18db53a7ddf9ca14d5f373180586ef6a31..0ad57c51be79cd3577b43c9af777bff710308fac 100644 --- a/paddle/fluid/operators/math/fc_compute.h +++ b/paddle/fluid/operators/math/fc_compute.h @@ -30,15 +30,17 @@ inline void FCCompute(const BlasT& blas, const int M, return; } if (relu) { - auto compute = - jit::Get, platform::CPUPlace>(N); + auto compute = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(N); for (int i = 0; i < M; i++) { T* dst = Y + i * N; compute(B, dst, dst, N); } } else { - auto compute = - jit::Get, platform::CPUPlace>(N); + auto compute = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(N); #ifdef PADDLE_WITH_MKLML #pragma omp parallel for #endif diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index 1d9d98b10646af9e199f6c481740d30745888707..a1cb3f972826a67721b00ce6df0ec48cc34d6e03 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -16,8 +16,8 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/jit/kernels.h" -#include "paddle/fluid/operators/math/blas.h" namespace paddle { namespace operators { namespace math { @@ -81,28 +81,11 @@ class SoftmaxFunctor> { const int kBatchDim = 0; const int kClassDim = 1; // 2D data. 
Batch x C - const int batch_size = in_dims[kBatchDim]; - const int num_classes = in_dims[kClassDim]; - std::vector entities(batch_size); - auto blas = math::GetBlas(context); - for (int n = 0; n < batch_size; ++n) { - entities[n] = in_data[n * num_classes]; - for (int c = 1; c < num_classes; ++c) { - entities[n] = in_data[n * num_classes + c] > entities[n] - ? in_data[n * num_classes + c] - : entities[n]; - } - for (int c = 0; c < num_classes; ++c) { - out_data[n * num_classes + c] = - in_data[n * num_classes + c] - entities[n]; - } - } - - blas.VEXP(num_classes * batch_size, out_data, out_data); - for (int n = 0; n < batch_size; ++n) { - auto sum = blas.ASUM(num_classes, &out_data[n * num_classes], 1); - blas.SCAL(num_classes, 1.0f / sum, &out_data[n * num_classes]); - } + auto compute_softmax = + jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(in_dims[kClassDim]); + compute_softmax(in_data, out_data, in_dims[kClassDim], in_dims[kBatchDim]); } }; diff --git a/paddle/fluid/operators/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc similarity index 100% rename from paddle/fluid/operators/activation_mkldnn_op.cc rename to paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc diff --git a/paddle/fluid/operators/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc similarity index 100% rename from paddle/fluid/operators/batch_norm_mkldnn_op.cc rename to paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc diff --git a/paddle/fluid/operators/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc similarity index 100% rename from paddle/fluid/operators/concat_mkldnn_op.cc rename to paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc similarity index 100% rename from paddle/fluid/operators/conv_mkldnn_op.cc rename to paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc diff --git 
a/paddle/fluid/operators/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc similarity index 100% rename from paddle/fluid/operators/conv_transpose_mkldnn_op.cc rename to paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc diff --git a/paddle/fluid/operators/dequantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc similarity index 100% rename from paddle/fluid/operators/dequantize_mkldnn_op.cc rename to paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc diff --git a/paddle/fluid/operators/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc similarity index 100% rename from paddle/fluid/operators/fc_mkldnn_op.cc rename to paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc diff --git a/paddle/fluid/operators/gaussian_random_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc similarity index 100% rename from paddle/fluid/operators/gaussian_random_mkldnn_op.cc rename to paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc diff --git a/paddle/fluid/operators/lrn_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc similarity index 100% rename from paddle/fluid/operators/lrn_mkldnn_op.cc rename to paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc diff --git a/paddle/fluid/operators/mkldnn_activation_op.h b/paddle/fluid/operators/mkldnn/mkldnn_activation_op.h similarity index 100% rename from paddle/fluid/operators/mkldnn_activation_op.h rename to paddle/fluid/operators/mkldnn/mkldnn_activation_op.h diff --git a/paddle/fluid/operators/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc similarity index 100% rename from paddle/fluid/operators/pool_mkldnn_op.cc rename to paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc diff --git a/paddle/fluid/operators/quantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc similarity index 100% rename from paddle/fluid/operators/quantize_mkldnn_op.cc rename to 
paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc diff --git a/paddle/fluid/operators/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc similarity index 100% rename from paddle/fluid/operators/softmax_mkldnn_op.cc rename to paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc diff --git a/paddle/fluid/operators/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc similarity index 100% rename from paddle/fluid/operators/sum_mkldnn_op.cc rename to paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc diff --git a/paddle/fluid/operators/transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc similarity index 100% rename from paddle/fluid/operators/transpose_mkldnn_op.cc rename to paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc diff --git a/paddle/fluid/operators/ngraph/ngraph_bridge.cc b/paddle/fluid/operators/ngraph/ngraph_bridge.cc index d6e897ed4666261cdd0bd6565f61abb218d971e5..38e65524e870834710ff29f722c69eadf67d9dbe 100644 --- a/paddle/fluid/operators/ngraph/ngraph_bridge.cc +++ b/paddle/fluid/operators/ngraph/ngraph_bridge.cc @@ -31,6 +31,9 @@ std::map>>)>> NgraphBridge::NG_NODE_MAP = { + {"accuracy", NG_OPS::BuildAccuracyNode}, + {"conv2d", NG_OPS::BuildConv2dNode}, + {"conv2d_grad", NG_OPS::BuildConv2dGradNode}, {"elementwise_add", NG_OPS::BuildElementwiseAddNode}, {"elementwise_add_grad", NG_OPS::BuildElementwiseAddGradNode}, {"fill_constant", NG_OPS::BuildFillConstantNode}, @@ -38,6 +41,8 @@ std::map +#include +#include "ngraph/ngraph.hpp" +#include "paddle/fluid/platform/ngraph_helper.h" + +namespace paddle { +namespace operators { +namespace ngraphs { + +void BuildAccuracyNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto indices = platform::GetInputNode(op, "Indices", ngb_node_map); + auto label = platform::GetInputNode(op, "Label", ngb_node_map); + auto inference = platform::GetInputNode(op, "Out", ngb_node_map); + auto inference_shape = 
inference->get_shape(); + size_t num_samples = inference_shape.at(0); + size_t k = inference_shape.at(1); + + std::shared_ptr label_k = label; + if (k > 1) { + auto label_1d = std::make_shared( + label, ngraph::AxisVector{0, 1}, ngraph::Shape{num_samples}); + label_k = std::make_shared(label_1d, inference_shape, + ngraph::AxisSet{1}); + } + + auto node_equal = std::make_shared(indices, label_k); + auto node_eq_int = + std::make_shared(node_equal, ngraph::element::i64); + auto num_correct_0d = + std::make_shared(node_eq_int, ngraph::AxisSet{0, 1}); + std::shared_ptr num_correct = + platform::NgReshaper(num_correct_0d, ngraph::Shape{1}); + std::shared_ptr n_samples = ngraph::op::Constant::create( + ngraph::element::i64, ngraph::Shape{1}, {num_samples}); + std::shared_ptr accuracy = std::make_shared( + std::make_shared(num_correct, ngraph::element::f32), + std::make_shared(n_samples, ngraph::element::f32)); + + platform::SetOutputNode(op, "Accuracy", accuracy, ngb_node_map); + platform::SetOutputNode(op, "Correct", num_correct, ngb_node_map); + platform::SetOutputNode(op, "Total", n_samples, ngb_node_map); +} +} // namespace ngraphs +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/ngraph/ops/binary_unnary_op.h b/paddle/fluid/operators/ngraph/ops/binary_unary_op.h similarity index 100% rename from paddle/fluid/operators/ngraph/ops/binary_unnary_op.h rename to paddle/fluid/operators/ngraph/ops/binary_unary_op.h diff --git a/paddle/fluid/operators/ngraph/ops/conv2d_op.h b/paddle/fluid/operators/ngraph/ops/conv2d_op.h new file mode 100644 index 0000000000000000000000000000000000000000..46fb2703f51482afa0546f08b8fc7b2c98e281bc --- /dev/null +++ b/paddle/fluid/operators/ngraph/ops/conv2d_op.h @@ -0,0 +1,235 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "ngraph/ngraph.hpp" +#include "paddle/fluid/platform/ngraph_helper.h" + +namespace paddle { +namespace operators { +namespace ngraphs { + +std::shared_ptr GroupedConvolution( + const std::shared_ptr& data_batch, + const std::shared_ptr& filters, const ngraph::Strides strides, + const ngraph::Strides dilations, const ngraph::CoordinateDiff& paddings, + size_t groups) { + auto& data_shape = data_batch->get_shape(); + auto& filter_shape = filters->get_shape(); + ngraph::NodeVector ng_slices; + + for (size_t i = 0; i < groups; ++i) { + size_t channel_step = filter_shape.at(1); + const std::vector lower_bound{0, i * channel_step, 0, 0}; + const std::vector upper_bound{data_shape.at(0), + (i + 1) * channel_step, + data_shape.at(2), data_shape.at(3)}; + auto data_slice = std::make_shared( + data_batch, lower_bound, upper_bound); + + size_t filter_step = filter_shape.at(0) / groups; + const std::vector filter_lower_bound{i * filter_step, 0, 0, 0}; + const std::vector filter_upper_bound{ + (i + 1) * filter_step, filter_shape.at(1), filter_shape.at(2), + filter_shape.at(3)}; + auto filter_slice = std::make_shared( + filters, filter_lower_bound, filter_upper_bound); + auto ng_conv = std::make_shared( + data_slice, filter_slice, strides, dilations, paddings, paddings); + ng_slices.push_back(ng_conv); + } + + size_t concat_axis = 1; + return std::make_shared(ng_slices, concat_axis); +} + +std::shared_ptr GroupedGradConvolutionFilter( + const std::shared_ptr& data_batch, + const std::shared_ptr& filters, + const std::shared_ptr& 
doutput, const ngraph::Strides strides, + const ngraph::Strides dilations, const ngraph::CoordinateDiff& paddings, + size_t groups) { + auto& data_shape = data_batch->get_shape(); + auto& filter_shape = filters->get_shape(); + auto& out_shape = doutput->get_shape(); + ngraph::NodeVector ng_slices; + + for (size_t i = 0; i < groups; ++i) { + size_t channel_step = filter_shape.at(1); + const std::vector lower_bound{0, i * channel_step, 0, 0}; + const std::vector upper_bound{data_shape.at(0), + (i + 1) * channel_step, + data_shape.at(2), data_shape.at(3)}; + auto data_slice = std::make_shared( + data_batch, lower_bound, upper_bound); + + size_t filter_step = data_shape.at(0); + + const std::vector filter_lower_bound{i * filter_step, 0, 0, 0}; + const std::vector filter_upper_bound{ + (i + 1) * filter_step, filter_shape.at(1), filter_shape.at(2), + filter_shape.at(3)}; + auto filter_slice = std::make_shared( + filters, filter_lower_bound, filter_upper_bound); + + const std::vector olower_bound{0, i * filter_step, 0, 0}; + const std::vector oupper_bound{out_shape.at(0), + (i + 1) * filter_step, + out_shape.at(2), out_shape.at(3)}; + auto out_slice = std::make_shared(doutput, olower_bound, + oupper_bound); + + auto ng_conv = std::make_shared( + data_slice, filter_slice->get_shape(), out_slice, strides, dilations, + paddings, paddings, ngraph::Strides{1, 1}); + + ng_slices.push_back(ng_conv); + } + + size_t concat_axis = 0; + return std::make_shared(ng_slices, concat_axis); +} + +std::shared_ptr GroupedGradConvolutionData( + const std::shared_ptr& data_batch, + const std::shared_ptr& filters, + const std::shared_ptr& doutput, const ngraph::Strides strides, + const ngraph::Strides dilations, const ngraph::CoordinateDiff& paddings, + size_t groups) { + auto& data_shape = data_batch->get_shape(); + auto& filter_shape = filters->get_shape(); + auto& out_shape = doutput->get_shape(); + ngraph::NodeVector ng_slices; + + for (size_t i = 0; i < groups; ++i) { + size_t 
channel_step = filter_shape.at(1); + const std::vector lower_bound{0, i * channel_step, 0, 0}; + const std::vector upper_bound{data_shape.at(0), + (i + 1) * channel_step, + data_shape.at(2), data_shape.at(3)}; + auto data_slice = std::make_shared( + data_batch, lower_bound, upper_bound); + + size_t filter_step = data_shape.at(0); + + const std::vector filter_lower_bound{i * filter_step, 0, 0, 0}; + const std::vector filter_upper_bound{ + (i + 1) * filter_step, filter_shape.at(1), filter_shape.at(2), + filter_shape.at(3)}; + auto filter_slice = std::make_shared( + filters, filter_lower_bound, filter_upper_bound); + + const std::vector olower_bound{0, i * filter_step, 0, 0}; + const std::vector oupper_bound{out_shape.at(0), + (i + 1) * filter_step, + out_shape.at(2), out_shape.at(3)}; + auto out_slice = std::make_shared(doutput, olower_bound, + oupper_bound); + + auto ng_conv = std::make_shared( + data_slice->get_shape(), filter_slice, out_slice, strides, dilations, + paddings, paddings, ngraph::Strides{1, 1}); + ng_slices.push_back(ng_conv); + } + + size_t concat_axis = 1; + return std::make_shared(ng_slices, concat_axis); +} + +void BuildConv2dNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto op_attrs = paddle::framework::AttrReader(op->Attrs()); + auto filters = paddle::platform::GetInputNode(op, "Filter", ngb_node_map); + auto input = paddle::platform::GetInputNode(op, "Input", ngb_node_map); + + std::vector strides = op_attrs.Get>("strides"); + std::vector paddings = op_attrs.Get>("paddings"); + std::vector dilations = op_attrs.Get>("dilations"); + + const ngraph::Strides ng_strides{static_cast(strides.at(0)), + static_cast(strides.at(1))}; + const ngraph::Strides ng_dilations{static_cast(dilations.at(0)), + static_cast(dilations.at(1))}; + const ngraph::CoordinateDiff ng_paddings{ + static_cast(paddings.at(0)), + static_cast(paddings.at(1))}; + + int groups = static_cast(op_attrs.Get("groups")); + 
PADDLE_ENFORCE_GE(groups, 1, "conv groups needs be no less than 1"); + + std::shared_ptr result; + if (groups == 1) { + result = std::make_shared( + input, filters, ng_strides, ng_dilations, ng_paddings, ng_paddings); + } else { + result = GroupedConvolution(input, filters, ng_strides, ng_dilations, + ng_paddings, groups); + } + paddle::platform::SetOutputNode(op, "Output", result, ngb_node_map); +} + +void BuildConv2dGradNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto op_attrs = paddle::framework::AttrReader(op->Attrs()); + auto filter = paddle::platform::GetInputNode(op, "Filter", ngb_node_map); + auto input = paddle::platform::GetInputNode(op, "Input", ngb_node_map); + auto doutput = + paddle::platform::GetInputNode(op, "Output@GRAD", ngb_node_map); + + int groups = op_attrs.Get("groups"); + std::vector strides = op_attrs.Get>("strides"); + std::vector paddings = op_attrs.Get>("paddings"); + std::vector dilations = op_attrs.Get>("dilations"); + + const ngraph::Strides ng_strides{static_cast(strides.at(0)), + static_cast(strides.at(1))}; + const ngraph::Strides ng_dilations{static_cast(dilations.at(0)), + static_cast(dilations.at(1))}; + const ngraph::CoordinateDiff ng_paddings{ + static_cast(paddings.at(0)), + static_cast(paddings.at(1))}; + + std::shared_ptr dfilter; + std::shared_ptr dinput; + if (groups == 1) { + dfilter = std::make_shared( + input, filter->get_shape(), doutput, ng_strides, ng_dilations, + ng_paddings, ng_paddings, ngraph::Strides{1, 1}); + + dinput = std::make_shared( + input->get_shape(), filter, doutput, ng_strides, ng_dilations, + ng_paddings, ng_paddings, ngraph::Strides{1, 1}); + + } else { + dfilter = GroupedGradConvolutionFilter(input, filter, doutput, ng_strides, + ng_dilations, ng_paddings, groups); + dinput = GroupedGradConvolutionData(input, filter, doutput, ng_strides, + ng_dilations, ng_paddings, groups); + } + + paddle::platform::SetOutputNode(op, "Filter@GRAD", dfilter, 
ngb_node_map); + paddle::platform::SetOutputNode(op, "Input@GRAD", dinput, ngb_node_map); +} +} // namespace ngraphs +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/ngraph/ops/pool2d_op.h b/paddle/fluid/operators/ngraph/ops/pool2d_op.h new file mode 100644 index 0000000000000000000000000000000000000000..836c9d6c185b305d3dd4c9e9d30e23abb0c1431c --- /dev/null +++ b/paddle/fluid/operators/ngraph/ops/pool2d_op.h @@ -0,0 +1,174 @@ +/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include + +#include "ngraph/ngraph.hpp" +#include "paddle/fluid/platform/ngraph_helper.h" + +namespace paddle { +namespace operators { +namespace ngraphs { + +void BuildPool2dNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto op_attrs = paddle::framework::AttrReader(op->Attrs()); + auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map); + auto x_shape = x->get_shape(); + + std::string pooling_type = op_attrs.Get("pooling_type"); + std::vector ksize = op_attrs.Get>("ksize"); + std::vector strides = op_attrs.Get>("strides"); + std::vector paddings = op_attrs.Get>("paddings"); + + PADDLE_ENFORCE_EQ(x_shape.size() - 2, ksize.size(), + "Handling 2d pooling only"); + + if (op_attrs.Get("global_pooling")) { + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + ksize[i] = static_cast(x_shape.at(i + 2)); + } + } + + ngraph::Shape ng_padding_below{static_cast(paddings.at(0)), + static_cast(paddings.at(1))}; + ngraph::Shape ng_padding_above{static_cast(paddings.at(0)), + static_cast(paddings.at(1))}; + ngraph::Shape ng_ksize_shape{static_cast(ksize.at(0)), + static_cast(ksize.at(1))}; + ngraph::Strides ng_strides{static_cast(strides.at(0)), + static_cast(strides.at(1))}; + + auto ComputeCeiledOutput = [](size_t in, size_t k, size_t p, size_t s) { + return (in - k + 2 * p) / s + 1; + }; + + if (op_attrs.Get("ceil_mode")) { + auto dummy_out = paddle::platform::GetOutputNode(op, "Out", ngb_node_map); + auto dummpy_shape = dummy_out->get_shape(); + for (size_t i = 0; i < ng_padding_above.size(); ++i) { + auto desired_size = ComputeCeiledOutput(x_shape[i + 2], ksize[i], + paddings[i], strides[i]); + if (desired_size != dummpy_shape[i + 2]) { + ng_padding_above[i] += strides[i]; + } + } + } + + bool padding_exclusive = op_attrs.Get("exclusive"); + if (pooling_type == "max") { + auto pool2d = std::make_shared( + x, ng_ksize_shape, ng_strides, ng_padding_below, ng_padding_above); 
+ paddle::platform::SetOutputNode(op, "Out", pool2d, ngb_node_map); + } else if (pooling_type == "avg") { + std::shared_ptr pool2d; + if (op_attrs.Get("adaptive")) { + auto ComputeAdaptive = [](size_t in, size_t k) { + return std::floor(in / k); + }; + ng_strides[0] = x_shape.size() == 4 + ? ComputeAdaptive(x_shape[3], ksize[0]) + : ng_strides[0]; + ng_strides[1] = x_shape.size() == 4 + ? ComputeAdaptive(x_shape[3], ksize[0]) + : ng_strides[1]; + pool2d = + std::make_shared(x, ng_ksize_shape, ng_strides); + } else { + pool2d = std::make_shared( + x, ng_ksize_shape, ng_strides, ng_padding_below, ng_padding_above, + !padding_exclusive); + } + paddle::platform::SetOutputNode(op, "Out", pool2d, ngb_node_map); + } else { + PADDLE_THROW("Support max and avg pooling only"); + } +} + +void BuildPool2dGradNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto op_attrs = paddle::framework::AttrReader(op->Attrs()); + auto out = paddle::platform::GetInputNode(op, "Out", ngb_node_map); + auto dout = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map); + auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map); + auto x_shape = x->get_shape(); + + std::string pooling_type = op_attrs.Get("pooling_type"); + std::vector ksize = op_attrs.Get>("ksize"); + std::vector strides = op_attrs.Get>("strides"); + std::vector paddings = op_attrs.Get>("paddings"); + + PADDLE_ENFORCE_EQ(x_shape.size() - 2, ksize.size(), + "Handling 2d pooling only"); + + if (op_attrs.Get("global_pooling")) { + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + ksize[i] = static_cast(x_shape.at(i + 2)); + } + } + + ngraph::Shape ng_padding_below{static_cast(paddings.at(0)), + static_cast(paddings.at(1))}; + ngraph::Shape ng_padding_above{static_cast(paddings.at(0)), + static_cast(paddings.at(1))}; + ngraph::Shape ng_ksize_shape{static_cast(ksize.at(0)), + static_cast(ksize.at(1))}; + ngraph::Strides ng_strides{static_cast(strides.at(0)), 
+ static_cast(strides.at(1))}; + + bool padding_exclusive = op_attrs.Get("exclusive"); + if (pooling_type == "max") { + auto pool2d_grad = std::make_shared( + x, dout, out, ng_ksize_shape, ng_strides, ng_padding_below, + ng_padding_above); + paddle::platform::SetOutputNode(op, "X@GRAD", pool2d_grad, ngb_node_map); + } else if (pooling_type == "avg") { + std::shared_ptr pool2d_grad; + if (op_attrs.Get("adaptive")) { + auto ComputeAdaptive = [](size_t in, size_t k) { + return std::floor(in / k); + }; + ng_strides[0] = x_shape.size() == 4 + ? ComputeAdaptive(x_shape[3], ksize[0]) + : ng_strides[0]; + ng_strides[1] = x_shape.size() == 4 + ? ComputeAdaptive(x_shape[3], ksize[0]) + : ng_strides[1]; + pool2d_grad = std::make_shared( + x->get_shape(), dout, ng_ksize_shape, ng_strides, ng_padding_below, + ng_padding_above, !padding_exclusive); + } else { + pool2d_grad = std::make_shared( + x->get_shape(), dout, ng_ksize_shape, ng_strides, ng_padding_below, + ng_padding_above, !padding_exclusive); + } + paddle::platform::SetOutputNode(op, "X@GRAD", pool2d_grad, ngb_node_map); + } else { + PADDLE_THROW("Support max and avg pooling only"); + } +} +} // namespace ngraphs +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/ngraph/ops/top_k_op.h b/paddle/fluid/operators/ngraph/ops/top_k_op.h index ea66953a125860ab1ce8309819b6c433ff32eaaa..852ecd7139a3c7046e78265ca021b2ce286c63c0 100644 --- a/paddle/fluid/operators/ngraph/ops/top_k_op.h +++ b/paddle/fluid/operators/ngraph/ops/top_k_op.h @@ -36,11 +36,6 @@ void BuildTopKNode( std::make_shared(top_k, 0); std::shared_ptr out = std::make_shared(top_k, 1); - auto dummy_out = paddle::platform::GetOutputNode(op, "Out", ngb_node_map); - if (dummy_out && dummy_out->get_element_type() != out->get_element_type()) { - out = std::make_shared(out, - dummy_out->get_element_type()); - } paddle::platform::SetOutputNode(op, "Indices", indices, ngb_node_map); paddle::platform::SetOutputNode(op, "Out", out, 
ngb_node_map); } diff --git a/paddle/fluid/operators/norm_op.h b/paddle/fluid/operators/norm_op.h index 6c95d3f3bf3a3b0448a8f39915f8b025f7d3bd46..f81cbc2c733af2a42f27e2ecb05ee2f8e2f8c17b 100644 --- a/paddle/fluid/operators/norm_op.h +++ b/paddle/fluid/operators/norm_op.h @@ -99,10 +99,10 @@ class NormGradKernel : public framework::OpKernel { auto dx_e = framework::EigenVector::Flatten(*out_dx); Eigen::DSizes shape(pre, n, post); - Eigen::DSizes norm_shape(pre, post); + Eigen::DSizes rshape(pre, 1, post); auto x = x_e.reshape(shape); auto dy = dy_e.reshape(shape); - auto norm = norm_e.reshape(norm_shape); + auto norm = norm_e.reshape(rshape); auto dx = dx_e.reshape(shape); framework::Tensor rsum; @@ -111,7 +111,6 @@ class NormGradKernel : public framework::OpKernel { Eigen::DSizes rdim(1); Eigen::DSizes bcast(1, n, 1); - Eigen::DSizes rshape(pre, 1, post); // dx = ( dy/sqrt(sum(x*x)) ) * [1 - x*sum(x) / (sum(x*x) + e)] // = [dy - dy * x * sum(x) / (sum(x*x) + e)] / sqrt(sum(x*x)) diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index 5399ae556e7f38a551d680704d8d825e2fdba88a..fc3636e0b24765f681d3260b07fe854309774a40 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -259,7 +259,7 @@ Example: W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1 $$ - For exclusive = true: + For exclusive = false: $$ hstart = i * strides[0] - paddings[0] hend = hstart + ksize[0] @@ -267,7 +267,7 @@ Example: wend = wstart + ksize[1] Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{ksize[0] * ksize[1]} $$ - For exclusive = false: + For exclusive = true: $$ hstart = max(0, i * strides[0] - paddings[0]) hend = min(H, hstart + ksize[0]) @@ -403,7 +403,7 @@ Example: H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1 \\ W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1 $$ - For exclusive = true: + For 
exclusive = false: $$ dstart = i * strides[0] - paddings[0] dend = dstart + ksize[0] @@ -413,7 +413,7 @@ Example: wend = wstart + ksize[2] Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{ksize[0] * ksize[1] * ksize[2]} $$ - For exclusive = false: + For exclusive = true: $$ dstart = max(0, i * strides[0] - paddings[0]) dend = min(D, dstart + ksize[0]) diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc index f08798794a2f9fc042800583cbc032d6f12bf3dc..43a49de52242b96aade91013e89228fcb3247302 100644 --- a/paddle/fluid/operators/reader/ctr_reader.cc +++ b/paddle/fluid/operators/reader/ctr_reader.cc @@ -213,7 +213,7 @@ void ReadSvmData(const DataDesc& data_desc, std::shared_ptr reader, framework::LoD lod{lod_data}; lod_tensor.set_lod(lod); int64_t* tensor_data = lod_tensor.mutable_data( - framework::make_ddim({1, static_cast(batch_feasign.size())}), + framework::make_ddim({static_cast(batch_feasign.size()), 1}), platform::CPUPlace()); memcpy(tensor_data, batch_feasign.data(), batch_feasign.size() * sizeof(int64_t)); @@ -223,7 +223,7 @@ void ReadSvmData(const DataDesc& data_desc, std::shared_ptr reader, // insert label tensor framework::LoDTensor label_tensor; auto* label_tensor_data = label_tensor.mutable_data( - framework::make_ddim({1, static_cast(batch_label.size())}), + framework::make_ddim({static_cast(batch_label.size()), 1}), platform::CPUPlace()); memcpy(label_tensor_data, batch_label.data(), batch_label.size() * sizeof(int64_t)); diff --git a/paddle/fluid/operators/reader/ctr_reader_test.cc b/paddle/fluid/operators/reader/ctr_reader_test.cc index 9f3a254c84d4e04fbcd449644a7e138eff520fbc..6410439816d8ae4a9d1df507819071ce76b5308e 100644 --- a/paddle/fluid/operators/reader/ctr_reader_test.cc +++ b/paddle/fluid/operators/reader/ctr_reader_test.cc @@ -123,7 +123,7 @@ TEST(CTR_READER, read_data) { std::vector>> data_slot_6003{b1, b2, b3, b4}; - std::vector label_dims = {{1, 3}, {1, 3}, {1, 
3}, {1, 1}}; + std::vector label_dims = {{3, 1}, {3, 1}, {3, 1}, {1, 1}}; LoDTensorBlockingQueueHolder queue_holder; int capacity = 64; diff --git a/paddle/fluid/operators/reduce_ops/CMakeLists.txt b/paddle/fluid/operators/reduce_ops/CMakeLists.txt index 5fe4d15ae2c6254a50318813c852b6c314880aba..ebd07d90ebe6b0ba008ac89c01c4f054f96a6da9 100644 --- a/paddle/fluid/operators/reduce_ops/CMakeLists.txt +++ b/paddle/fluid/operators/reduce_ops/CMakeLists.txt @@ -1,5 +1,9 @@ include(operators) -register_operators() +if(WITH_GPU) + register_operators(DEPS cub) +else() + register_operators() +endif() if(WITH_GPU) file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.part.cu") diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 8eab3a6f891f1dfa91c5ce316f1419df2cd42248..eda54f76b898cdf893347d31cadb86dea892a4ce 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -327,13 +327,45 @@ class Reshape2GradOp : public framework::OperatorWithKernel { } }; +class ReshapeOpInplaceInToOut : public framework::InplaceInToOut { + public: + using InplaceInToOut::InplaceInToOut; + + protected: + std::unordered_map Apply( + const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + std::unordered_map inplace_in_to_out = { + {"X", "Out"}, + }; + return inplace_in_to_out; + } +}; + +class ReshapeGradInplaceInToOut : public framework::InplaceInToOut { + using InplaceInToOut::InplaceInToOut; + + protected: + std::unordered_map Apply( + const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + std::unordered_map inplace_in_to_out = { + {framework::GradVarName("Out"), framework::GradVarName("X")}, + }; + return inplace_in_to_out; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; +namespace plat = paddle::platform; REGISTER_OPERATOR(reshape, ops::ReshapeOp, ops::ReshapeOpMaker, - paddle::framework::DefaultGradOpDescMaker); 
-REGISTER_OPERATOR(reshape_grad, ops::ReshapeGradOp); + paddle::framework::DefaultGradOpDescMaker, + ops::ReshapeOpInplaceInToOut); +REGISTER_OPERATOR(reshape_grad, ops::ReshapeGradOp, + ops::ReshapeGradInplaceInToOut); REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double, ops::ReshapeKernel, int, ops::ReshapeKernel, int64_t, ops::ReshapeKernel); @@ -343,8 +375,9 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel, ops::ReshapeGradKernel); REGISTER_OPERATOR(reshape2, ops::Reshape2Op, ops::Reshape2OpMaker, - ops::Reshape2GradMaker); -REGISTER_OPERATOR(reshape2_grad, ops::Reshape2GradOp); + ops::Reshape2GradMaker, ops::ReshapeOpInplaceInToOut); +REGISTER_OPERATOR(reshape2_grad, ops::Reshape2GradOp, + ops::ReshapeGradInplaceInToOut); REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double, ops::ReshapeKernel, int, ops::ReshapeKernel, int64_t, ops::ReshapeKernel); @@ -356,16 +389,20 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel, #ifdef PADDLE_WITH_CUDA REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double, ops::ReshapeKernel, int, ops::ReshapeKernel, - int64_t, ops::ReshapeKernel); + int64_t, ops::ReshapeKernel, plat::float16, + ops::ReshapeKernel); REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel, double, ops::ReshapeGradKernel, int, ops::ReshapeGradKernel, int64_t, + ops::ReshapeGradKernel, plat::float16, ops::ReshapeGradKernel); REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double, ops::ReshapeKernel, int, ops::ReshapeKernel, - int64_t, ops::ReshapeKernel); + int64_t, ops::ReshapeKernel, plat::float16, + ops::ReshapeKernel); REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel, double, ops::ReshapeGradKernel, int, ops::ReshapeGradKernel, int64_t, + ops::ReshapeGradKernel, plat::float16, ops::ReshapeGradKernel); #endif diff --git a/paddle/fluid/operators/scale_op.cc 
b/paddle/fluid/operators/scale_op.cc index 981969d2aaa684731a615ec64ca7f7718b35cf09..4ea77ed30db212b694f2050952655dd1a42215bd 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -100,13 +100,14 @@ class ScaleGradMaker : public framework::SingleGradOpDescMaker { } }; +using ScaleOpInplace = framework::SingleOpInplaceInToOut; } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradMaker, - ops::ScaleOpVarTypeInference); + ops::ScaleOpVarTypeInference, ops::ScaleOpInplace); REGISTER_OP_CPU_KERNEL( scale, ops::ScaleKernel, ops::ScaleKernel, diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc index 789e61b2d332b9391ef45a8ebe58ad0f1a4d2bf0..94995fc99612adb1164e60f1a51747f74eacfb73 100644 --- a/paddle/fluid/operators/slice_op.cc +++ b/paddle/fluid/operators/slice_op.cc @@ -54,6 +54,9 @@ class SliceOp : public framework::OperatorWithKernel { out_dims[axes[i]] = end - start; } ctx->SetOutputDim("Out", out_dims); + if (axes[0] != 0) { + ctx->ShareLoD("Input", /*->*/ "Out"); + } } protected: diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index bc889a5a042a27838ba6ba0fccb187ec11b5f0c5..8fbf299a7c056aff3bfd4cbd3e3cc28fd3c6ccf2 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -198,6 +198,21 @@ class SoftmaxOpGradMaker : public framework::SingleGradOpDescMaker { return std::unique_ptr(op); } }; + +class SoftmaxInplaceInToOut : public framework::InplaceInToOut { + public: + using framework::InplaceInToOut::InplaceInToOut; + + protected: + std::unordered_map Apply( + const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + return std::unordered_map{ + {"X", "Out"}, + }; + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/stack_op.cu 
b/paddle/fluid/operators/stack_op.cu index bf2a9e5b3d22996e688621727cb280dc9aed7859..24d0b2f906a8e0b360c3f477c9290ebe5d57a3ff 100644 --- a/paddle/fluid/operators/stack_op.cu +++ b/paddle/fluid/operators/stack_op.cu @@ -17,13 +17,16 @@ namespace plat = paddle::platform; namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(stack, ops::StackKernel, - ops::StackKernel, - ops::StackKernel, - ops::StackKernel); +REGISTER_OP_CUDA_KERNEL( + stack, ops::StackKernel, + ops::StackKernel, + ops::StackKernel, + ops::StackKernel, + ops::StackKernel); -REGISTER_OP_CUDA_KERNEL(stack_grad, - ops::StackGradKernel, - ops::StackGradKernel, - ops::StackGradKernel, - ops::StackGradKernel); +REGISTER_OP_CUDA_KERNEL( + stack_grad, ops::StackGradKernel, + ops::StackGradKernel, + ops::StackGradKernel, + ops::StackGradKernel, + ops::StackGradKernel); diff --git a/paddle/fluid/operators/transpose_op.cu.cc b/paddle/fluid/operators/transpose_op.cu.cc index b4025350fa9f3610bde43eee91cd059f3063813f..915774e5f3624f26dbd1451a99d7bf0bf75a72c8 100644 --- a/paddle/fluid/operators/transpose_op.cu.cc +++ b/paddle/fluid/operators/transpose_op.cu.cc @@ -15,19 +15,27 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/transpose_op.h" namespace ops = paddle::operators; +namespace plat = paddle::platform; + REGISTER_OP_CUDA_KERNEL( transpose, ops::TransposeKernel, - ops::TransposeKernel); + ops::TransposeKernel, + ops::TransposeKernel); REGISTER_OP_CUDA_KERNEL( transpose_grad, ops::TransposeGradKernel, - ops::TransposeGradKernel); + ops::TransposeGradKernel, + ops::TransposeGradKernel); REGISTER_OP_CUDA_KERNEL( transpose2, ops::TransposeKernel, - ops::TransposeKernel); + ops::TransposeKernel, + ops::TransposeKernel); REGISTER_OP_CUDA_KERNEL( transpose2_grad, ops::TransposeGradKernel, - ops::TransposeGradKernel); + ops::TransposeGradKernel, + ops::TransposeGradKernel); diff --git a/paddle/fluid/operators/yolov3_loss_op.h b/paddle/fluid/operators/yolov3_loss_op.h deleted file mode 100644 index 0bb285722ddedf721d98237760ec9868e2134442..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/yolov3_loss_op.h +++ /dev/null @@ -1,483 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ - -#pragma once -#include -#include -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -template -using EigenTensor = framework::EigenTensor; -template -using EigenVector = framework::EigenVector; - -using Array5 = Eigen::DSizes; - -template -static inline bool isZero(T x) { - return fabs(x) < 1e-6; -} - -template -static inline T sigmoid(T x) { - return 1.0 / (exp(-1.0 * x) + 1.0); -} - -template -static inline T CalcMaskPointNum(const Tensor& mask) { - auto mask_t = EigenVector::Flatten(mask); - T count = 0.0; - for (int i = 0; i < mask_t.dimensions()[0]; i++) { - if (mask_t(i)) { - count += 1.0; - } - } - return count; -} - -template -static inline T CalcMSEWithMask(const Tensor& x, const Tensor& y, - const Tensor& mask) { - auto x_t = EigenVector::Flatten(x); - auto y_t = EigenVector::Flatten(y); - auto mask_t = EigenVector::Flatten(mask); - - T error_sum = 0.0; - T points = 0.0; - for (int i = 0; i < x_t.dimensions()[0]; i++) { - if (mask_t(i)) { - error_sum += pow(x_t(i) - y_t(i), 2); - points += 1; - } - } - return (error_sum / points); -} - -template -static void CalcMSEGradWithMask(Tensor* grad, const Tensor& x, const Tensor& y, - const Tensor& mask, T mf) { - auto grad_t = EigenVector::Flatten(*grad).setConstant(0.0); - auto x_t = EigenVector::Flatten(x); - auto y_t = EigenVector::Flatten(y); - auto mask_t = EigenVector::Flatten(mask); - - for (int i = 0; i < x_t.dimensions()[0]; i++) { - if (mask_t(i)) { - grad_t(i) = 2.0 * (x_t(i) - y_t(i)) / mf; - } - } -} - -template -static inline T CalcBCEWithMask(const Tensor& x, const Tensor& y, - const Tensor& mask) { - auto x_t = EigenVector::Flatten(x); - auto y_t = EigenVector::Flatten(y); - auto mask_t = EigenVector::Flatten(mask); - - T error_sum = 0.0; - T points = 0.0; - for (int i = 0; i < x_t.dimensions()[0]; i++) { - if (mask_t(i)) { - error_sum += - -1.0 * (y_t(i) * log(x_t(i)) + (1.0 - y_t(i)) * log(1.0 - x_t(i))); - 
points += 1; - } - } - return (error_sum / points); -} - -template -static inline void CalcBCEGradWithMask(Tensor* grad, const Tensor& x, - const Tensor& y, const Tensor& mask, - T mf) { - auto grad_t = EigenVector::Flatten(*grad).setConstant(0.0); - auto x_t = EigenVector::Flatten(x); - auto y_t = EigenVector::Flatten(y); - auto mask_t = EigenVector::Flatten(mask); - - for (int i = 0; i < x_t.dimensions()[0]; i++) { - if (mask_t(i)) { - grad_t(i) = ((1.0 - y_t(i)) / (1.0 - x_t(i)) - y_t(i) / x_t(i)) / mf; - } - } -} - -template -static void CalcPredResult(const Tensor& input, Tensor* pred_conf, - Tensor* pred_class, Tensor* pred_x, Tensor* pred_y, - Tensor* pred_w, Tensor* pred_h, const int anchor_num, - const int class_num) { - const int n = input.dims()[0]; - const int h = input.dims()[2]; - const int w = input.dims()[3]; - const int box_attr_num = 5 + class_num; - - auto input_t = EigenTensor::From(input); - auto pred_conf_t = EigenTensor::From(*pred_conf); - auto pred_class_t = EigenTensor::From(*pred_class); - auto pred_x_t = EigenTensor::From(*pred_x); - auto pred_y_t = EigenTensor::From(*pred_y); - auto pred_w_t = EigenTensor::From(*pred_w); - auto pred_h_t = EigenTensor::From(*pred_h); - - for (int i = 0; i < n; i++) { - for (int an_idx = 0; an_idx < anchor_num; an_idx++) { - for (int j = 0; j < h; j++) { - for (int k = 0; k < w; k++) { - pred_x_t(i, an_idx, j, k) = - sigmoid(input_t(i, box_attr_num * an_idx, j, k)); - pred_y_t(i, an_idx, j, k) = - sigmoid(input_t(i, box_attr_num * an_idx + 1, j, k)); - pred_w_t(i, an_idx, j, k) = - input_t(i, box_attr_num * an_idx + 2, j, k); - pred_h_t(i, an_idx, j, k) = - input_t(i, box_attr_num * an_idx + 3, j, k); - - pred_conf_t(i, an_idx, j, k) = - sigmoid(input_t(i, box_attr_num * an_idx + 4, j, k)); - - for (int c = 0; c < class_num; c++) { - pred_class_t(i, an_idx, j, k, c) = - sigmoid(input_t(i, box_attr_num * an_idx + 5 + c, j, k)); - } - } - } - } - } -} - -template -static T CalcBoxIoU(std::vector box1, 
std::vector box2) { - T b1_x1 = box1[0] - box1[2] / 2; - T b1_x2 = box1[0] + box1[2] / 2; - T b1_y1 = box1[1] - box1[3] / 2; - T b1_y2 = box1[1] + box1[3] / 2; - T b2_x1 = box2[0] - box2[2] / 2; - T b2_x2 = box2[0] + box2[2] / 2; - T b2_y1 = box2[1] - box2[3] / 2; - T b2_y2 = box2[1] + box2[3] / 2; - - T b1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1); - T b2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1); - - T inter_rect_x1 = std::max(b1_x1, b2_x1); - T inter_rect_y1 = std::max(b1_y1, b2_y1); - T inter_rect_x2 = std::min(b1_x2, b2_x2); - T inter_rect_y2 = std::min(b1_y2, b2_y2); - T inter_area = std::max(inter_rect_x2 - inter_rect_x1, static_cast(0.0)) * - std::max(inter_rect_y2 - inter_rect_y1, static_cast(0.0)); - - return inter_area / (b1_area + b2_area - inter_area); -} - -template -static void PreProcessGTBox(const Tensor& gt_box, const Tensor& gt_label, - const float ignore_thresh, std::vector anchors, - const int grid_size, Tensor* obj_mask, - Tensor* noobj_mask, Tensor* tx, Tensor* ty, - Tensor* tw, Tensor* th, Tensor* tconf, - Tensor* tclass) { - const int n = gt_box.dims()[0]; - const int b = gt_box.dims()[1]; - const int anchor_num = anchors.size() / 2; - auto gt_box_t = EigenTensor::From(gt_box); - auto gt_label_t = EigenTensor::From(gt_label); - auto obj_mask_t = EigenTensor::From(*obj_mask).setConstant(0); - auto noobj_mask_t = EigenTensor::From(*noobj_mask).setConstant(1); - auto tx_t = EigenTensor::From(*tx).setConstant(0.0); - auto ty_t = EigenTensor::From(*ty).setConstant(0.0); - auto tw_t = EigenTensor::From(*tw).setConstant(0.0); - auto th_t = EigenTensor::From(*th).setConstant(0.0); - auto tconf_t = EigenTensor::From(*tconf).setConstant(0.0); - auto tclass_t = EigenTensor::From(*tclass).setConstant(0.0); - - for (int i = 0; i < n; i++) { - for (int j = 0; j < b; j++) { - if (isZero(gt_box_t(i, j, 0)) && isZero(gt_box_t(i, j, 1)) && - isZero(gt_box_t(i, j, 2)) && isZero(gt_box_t(i, j, 3))) { - continue; - } - - int cur_label = gt_label_t(i, j); - T gx = 
gt_box_t(i, j, 0) * grid_size; - T gy = gt_box_t(i, j, 1) * grid_size; - T gw = gt_box_t(i, j, 2) * grid_size; - T gh = gt_box_t(i, j, 3) * grid_size; - int gi = static_cast(gx); - int gj = static_cast(gy); - - T max_iou = static_cast(0); - T iou; - int best_an_index = -1; - std::vector gt_box_shape({0, 0, gw, gh}); - for (int an_idx = 0; an_idx < anchor_num; an_idx++) { - std::vector anchor_shape({0, 0, static_cast(anchors[2 * an_idx]), - static_cast(anchors[2 * an_idx + 1])}); - iou = CalcBoxIoU(gt_box_shape, anchor_shape); - if (iou > max_iou) { - max_iou = iou; - best_an_index = an_idx; - } - if (iou > ignore_thresh) { - noobj_mask_t(i, an_idx, gj, gi) = 0; - } - } - obj_mask_t(i, best_an_index, gj, gi) = 1; - noobj_mask_t(i, best_an_index, gj, gi) = 0; - tx_t(i, best_an_index, gj, gi) = gx - gi; - ty_t(i, best_an_index, gj, gi) = gy - gj; - tw_t(i, best_an_index, gj, gi) = log(gw / anchors[2 * best_an_index]); - th_t(i, best_an_index, gj, gi) = log(gh / anchors[2 * best_an_index + 1]); - tclass_t(i, best_an_index, gj, gi, cur_label) = 1; - tconf_t(i, best_an_index, gj, gi) = 1; - } - } -} - -static void ExpandObjMaskByClassNum(Tensor* obj_mask_expand, - const Tensor& obj_mask) { - const int n = obj_mask_expand->dims()[0]; - const int an_num = obj_mask_expand->dims()[1]; - const int h = obj_mask_expand->dims()[2]; - const int w = obj_mask_expand->dims()[3]; - const int class_num = obj_mask_expand->dims()[4]; - auto obj_mask_expand_t = EigenTensor::From(*obj_mask_expand); - auto obj_mask_t = EigenTensor::From(obj_mask); - - obj_mask_expand_t = obj_mask_t.reshape(Array5(n, an_num, h, w, 1)) - .broadcast(Array5(1, 1, 1, 1, class_num)); -} - -template -static void AddAllGradToInputGrad( - Tensor* grad, T loss, const Tensor& pred_x, const Tensor& pred_y, - const Tensor& pred_conf, const Tensor& pred_class, const Tensor& grad_x, - const Tensor& grad_y, const Tensor& grad_w, const Tensor& grad_h, - const Tensor& grad_conf_target, const Tensor& grad_conf_notarget, - 
const Tensor& grad_class, const int class_num, const float loss_weight_xy, - const float loss_weight_wh, const float loss_weight_conf_target, - const float loss_weight_conf_notarget, const float loss_weight_class) { - const int n = pred_x.dims()[0]; - const int an_num = pred_x.dims()[1]; - const int h = pred_x.dims()[2]; - const int w = pred_x.dims()[3]; - const int attr_num = class_num + 5; - auto grad_t = EigenTensor::From(*grad).setConstant(0.0); - auto pred_x_t = EigenTensor::From(pred_x); - auto pred_y_t = EigenTensor::From(pred_y); - auto pred_conf_t = EigenTensor::From(pred_conf); - auto pred_class_t = EigenTensor::From(pred_class); - auto grad_x_t = EigenTensor::From(grad_x); - auto grad_y_t = EigenTensor::From(grad_y); - auto grad_w_t = EigenTensor::From(grad_w); - auto grad_h_t = EigenTensor::From(grad_h); - auto grad_conf_target_t = EigenTensor::From(grad_conf_target); - auto grad_conf_notarget_t = EigenTensor::From(grad_conf_notarget); - auto grad_class_t = EigenTensor::From(grad_class); - - for (int i = 0; i < n; i++) { - for (int j = 0; j < an_num; j++) { - for (int k = 0; k < h; k++) { - for (int l = 0; l < w; l++) { - grad_t(i, j * attr_num, k, l) = - grad_x_t(i, j, k, l) * pred_x_t(i, j, k, l) * - (1.0 - pred_x_t(i, j, k, l)) * loss * loss_weight_xy; - grad_t(i, j * attr_num + 1, k, l) = - grad_y_t(i, j, k, l) * pred_y_t(i, j, k, l) * - (1.0 - pred_y_t(i, j, k, l)) * loss * loss_weight_xy; - grad_t(i, j * attr_num + 2, k, l) = - grad_w_t(i, j, k, l) * loss * loss_weight_wh; - grad_t(i, j * attr_num + 3, k, l) = - grad_h_t(i, j, k, l) * loss * loss_weight_wh; - grad_t(i, j * attr_num + 4, k, l) = - grad_conf_target_t(i, j, k, l) * pred_conf_t(i, j, k, l) * - (1.0 - pred_conf_t(i, j, k, l)) * loss * loss_weight_conf_target; - grad_t(i, j * attr_num + 4, k, l) += - grad_conf_notarget_t(i, j, k, l) * pred_conf_t(i, j, k, l) * - (1.0 - pred_conf_t(i, j, k, l)) * loss * - loss_weight_conf_notarget; - - for (int c = 0; c < class_num; c++) { - grad_t(i, j 
* attr_num + 5 + c, k, l) = - grad_class_t(i, j, k, l, c) * pred_class_t(i, j, k, l, c) * - (1.0 - pred_class_t(i, j, k, l, c)) * loss * loss_weight_class; - } - } - } - } - } -} - -template -class Yolov3LossKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* gt_box = ctx.Input("GTBox"); - auto* gt_label = ctx.Input("GTLabel"); - auto* loss = ctx.Output("Loss"); - auto anchors = ctx.Attr>("anchors"); - int class_num = ctx.Attr("class_num"); - float ignore_thresh = ctx.Attr("ignore_thresh"); - float loss_weight_xy = ctx.Attr("loss_weight_xy"); - float loss_weight_wh = ctx.Attr("loss_weight_wh"); - float loss_weight_conf_target = ctx.Attr("loss_weight_conf_target"); - float loss_weight_conf_notarget = - ctx.Attr("loss_weight_conf_notarget"); - float loss_weight_class = ctx.Attr("loss_weight_class"); - - const int n = input->dims()[0]; - const int h = input->dims()[2]; - const int w = input->dims()[3]; - const int an_num = anchors.size() / 2; - - Tensor pred_x, pred_y, pred_w, pred_h; - Tensor pred_conf, pred_class; - pred_x.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - pred_y.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - pred_w.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - pred_h.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - pred_conf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - pred_class.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); - CalcPredResult(*input, &pred_conf, &pred_class, &pred_x, &pred_y, - &pred_w, &pred_h, an_num, class_num); - - Tensor obj_mask, noobj_mask; - Tensor tx, ty, tw, th, tconf, tclass; - obj_mask.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - noobj_mask.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - tx.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - ty.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - tw.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - th.mutable_data({n, an_num, 
h, w}, ctx.GetPlace()); - tconf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - tclass.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); - PreProcessGTBox(*gt_box, *gt_label, ignore_thresh, anchors, h, &obj_mask, - &noobj_mask, &tx, &ty, &tw, &th, &tconf, &tclass); - - Tensor obj_mask_expand; - obj_mask_expand.mutable_data({n, an_num, h, w, class_num}, - ctx.GetPlace()); - ExpandObjMaskByClassNum(&obj_mask_expand, obj_mask); - - T loss_x = CalcMSEWithMask(pred_x, tx, obj_mask); - T loss_y = CalcMSEWithMask(pred_y, ty, obj_mask); - T loss_w = CalcMSEWithMask(pred_w, tw, obj_mask); - T loss_h = CalcMSEWithMask(pred_h, th, obj_mask); - T loss_conf_target = CalcBCEWithMask(pred_conf, tconf, obj_mask); - T loss_conf_notarget = CalcBCEWithMask(pred_conf, tconf, noobj_mask); - T loss_class = CalcBCEWithMask(pred_class, tclass, obj_mask_expand); - - auto* loss_data = loss->mutable_data({1}, ctx.GetPlace()); - loss_data[0] = loss_weight_xy * (loss_x + loss_y) + - loss_weight_wh * (loss_w + loss_h) + - loss_weight_conf_target * loss_conf_target + - loss_weight_conf_notarget * loss_conf_notarget + - loss_weight_class * loss_class; - } -}; - -template -class Yolov3LossGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* gt_box = ctx.Input("GTBox"); - auto* gt_label = ctx.Input("GTLabel"); - auto anchors = ctx.Attr>("anchors"); - int class_num = ctx.Attr("class_num"); - float ignore_thresh = ctx.Attr("ignore_thresh"); - auto* input_grad = ctx.Output(framework::GradVarName("X")); - auto* output_grad = ctx.Input(framework::GradVarName("Loss")); - const T loss = output_grad->data()[0]; - float loss_weight_xy = ctx.Attr("loss_weight_xy"); - float loss_weight_wh = ctx.Attr("loss_weight_wh"); - float loss_weight_conf_target = ctx.Attr("loss_weight_conf_target"); - float loss_weight_conf_notarget = - ctx.Attr("loss_weight_conf_notarget"); - float 
loss_weight_class = ctx.Attr("loss_weight_class"); - - const int n = input->dims()[0]; - const int c = input->dims()[1]; - const int h = input->dims()[2]; - const int w = input->dims()[3]; - const int an_num = anchors.size() / 2; - - Tensor pred_x, pred_y, pred_w, pred_h; - Tensor pred_conf, pred_class; - pred_x.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - pred_y.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - pred_w.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - pred_h.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - pred_conf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - pred_class.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); - CalcPredResult(*input, &pred_conf, &pred_class, &pred_x, &pred_y, - &pred_w, &pred_h, an_num, class_num); - - Tensor obj_mask, noobj_mask; - Tensor tx, ty, tw, th, tconf, tclass; - obj_mask.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - noobj_mask.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - tx.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - ty.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - tw.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - th.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - tconf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - tclass.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); - PreProcessGTBox(*gt_box, *gt_label, ignore_thresh, anchors, h, &obj_mask, - &noobj_mask, &tx, &ty, &tw, &th, &tconf, &tclass); - - Tensor obj_mask_expand; - obj_mask_expand.mutable_data({n, an_num, h, w, class_num}, - ctx.GetPlace()); - ExpandObjMaskByClassNum(&obj_mask_expand, obj_mask); - - Tensor grad_x, grad_y, grad_w, grad_h; - Tensor grad_conf_target, grad_conf_notarget, grad_class; - grad_x.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - grad_y.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - grad_w.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - grad_h.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - grad_conf_target.mutable_data({n, an_num, h, w}, 
ctx.GetPlace()); - grad_conf_notarget.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - grad_class.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); - T obj_mf = CalcMaskPointNum(obj_mask); - T noobj_mf = CalcMaskPointNum(noobj_mask); - T obj_expand_mf = CalcMaskPointNum(obj_mask_expand); - CalcMSEGradWithMask(&grad_x, pred_x, tx, obj_mask, obj_mf); - CalcMSEGradWithMask(&grad_y, pred_y, ty, obj_mask, obj_mf); - CalcMSEGradWithMask(&grad_w, pred_w, tw, obj_mask, obj_mf); - CalcMSEGradWithMask(&grad_h, pred_h, th, obj_mask, obj_mf); - CalcBCEGradWithMask(&grad_conf_target, pred_conf, tconf, obj_mask, - obj_mf); - CalcBCEGradWithMask(&grad_conf_notarget, pred_conf, tconf, noobj_mask, - noobj_mf); - CalcBCEGradWithMask(&grad_class, pred_class, tclass, obj_mask_expand, - obj_expand_mf); - - input_grad->mutable_data({n, c, h, w}, ctx.GetPlace()); - AddAllGradToInputGrad( - input_grad, loss, pred_x, pred_y, pred_conf, pred_class, grad_x, grad_y, - grad_w, grad_h, grad_conf_target, grad_conf_notarget, grad_class, - class_num, loss_weight_xy, loss_weight_wh, loss_weight_conf_target, - loss_weight_conf_notarget, loss_weight_class); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 1f51b5bab3068cc89bffa85de28a9438359659f3..fbb2ac3fe8c5de9b0be593df225677c6a7a89e9c 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -1,4 +1,4 @@ -proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto) +proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto simple_threadpool) py_proto_compile(profiler_py_proto SRCS profiler.proto) add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) @@ -36,7 +36,7 @@ cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info) nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce) -cc_library(place SRCS place.cc DEPS enforce boost) 
+cc_library(place SRCS place.cc DEPS enforce boost lib_any) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) add_subdirectory(dynload) diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/cuda_device_function.h index 2ce8f141d3c51661305f4952479cf2889fc4f396..31b6c38d613cf9df8fa7e8f6a8e1cfa310280968 100644 --- a/paddle/fluid/platform/cuda_device_function.h +++ b/paddle/fluid/platform/cuda_device_function.h @@ -53,10 +53,12 @@ inline static int RoundToPowerOfTwo(int dim) { __VA_ARGS__; \ } break -#define CUDA_LAUNCH_KERNEL_HELPER(...) \ - CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__); \ +#define CUDA_LAUNCH_KERNEL_HELPER(...) \ + CUDA_LAUNCH_KERNEL_BASE(1024, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(512, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__); \ CUDA_LAUNCH_KERNEL_BASE(32, ##__VA_ARGS__); template diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h index d0619293acf2d2df0d925e969bdeb8e45cda6e2b..a260cda49138580b209e647af459e9392d9f18f1 100644 --- a/paddle/fluid/platform/dynload/mklml.h +++ b/paddle/fluid/platform/dynload/mklml.h @@ -70,6 +70,8 @@ extern void* mklml_dso_handle; __macro(cblas_ddot); \ __macro(cblas_sasum); \ __macro(cblas_dasum); \ + __macro(cblas_isamax); \ + __macro(cblas_idamax); \ __macro(cblas_sscal); \ __macro(cblas_dscal); \ __macro(vsAdd); \ diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 15413785bab3c0fd77244141e8f1840ca0cc1356..142d38f0609d963ce3ff45c595b8432b0e5edd21 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -71,9 +71,8 @@ struct EnforceNotMet : public std::exception { } } - template - EnforceNotMet(const char* f, int l, ARGS... 
args) { - Init(string::Sprintf(args...), f, l); + EnforceNotMet(const std::string& str, const char* f, int l) { + Init(str, f, l); } const char* what() const noexcept override { return err_str_.c_str(); } @@ -142,28 +141,23 @@ struct EOFException : public std::exception { inline bool is_error(bool stat) { return !stat; } -template -inline typename std::enable_if::type throw_on_error( - bool stat, const Args&... args) { +inline void throw_on_error(bool stat, const std::string& msg) { #ifndef REPLACE_ENFORCE_GLOG - throw std::runtime_error(string::Sprintf(args...)); + throw std::runtime_error(msg); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << msg; #endif } #ifdef PADDLE_WITH_CUDA -inline bool is_error(cudaError_t e) { return UNLIKELY(e); } +inline bool is_error(cudaError_t e) { return e != cudaSuccess; } -template -inline typename std::enable_if::type throw_on_error( - cudaError_t e, const Args&... args) { +inline void throw_on_error(cudaError_t e, const std::string& msg) { #ifndef REPLACE_ENFORCE_GLOG - throw thrust::system_error(e, thrust::cuda_category(), - string::Sprintf(args...)); + throw thrust::system_error(e, thrust::cuda_category(), msg); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << msg; #endif } @@ -171,14 +165,12 @@ inline bool is_error(curandStatus_t stat) { return stat != CURAND_STATUS_SUCCESS; } -template -inline typename std::enable_if::type throw_on_error( - curandStatus_t stat, const Args&... args) { +inline void throw_on_error(curandStatus_t stat, const std::string& msg) { #ifndef REPLACE_ENFORCE_GLOG throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(), - string::Sprintf(args...)); + msg); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << msg; #endif } @@ -186,14 +178,11 @@ inline bool is_error(cudnnStatus_t stat) { return stat != CUDNN_STATUS_SUCCESS; } -template -inline typename std::enable_if::type throw_on_error( - cudnnStatus_t stat, const Args&... 
args) { +inline void throw_on_error(cudnnStatus_t stat, const std::string& msg) { #ifndef REPLACE_ENFORCE_GLOG - throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) + - string::Sprintf(args...)); + throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) + msg); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << platform::dynload::cudnnGetErrorString(stat) << msg; #endif } @@ -201,9 +190,7 @@ inline bool is_error(cublasStatus_t stat) { return stat != CUBLAS_STATUS_SUCCESS; } -template -inline typename std::enable_if::type throw_on_error( - cublasStatus_t stat, const Args&... args) { +inline void throw_on_error(cublasStatus_t stat, const std::string& msg) { std::string err; if (stat == CUBLAS_STATUS_NOT_INITIALIZED) { err = "CUBLAS: not initialized, "; @@ -225,87 +212,45 @@ inline typename std::enable_if::type throw_on_error( err = "CUBLAS: license error, "; } #ifndef REPLACE_ENFORCE_GLOG - throw std::runtime_error(err + string::Sprintf(args...)); + throw std::runtime_error(err + msg); #else - LOG(FATAL) << err << string::Sprintf(args...); + LOG(FATAL) << err << msg; #endif } #if !defined(__APPLE__) && !defined(_WIN32) -template -inline typename std::enable_if::type throw_on_error( - ncclResult_t stat, const Args&... 
args) { - if (stat == ncclSuccess) { - return; - } else { +inline bool is_error(ncclResult_t nccl_result) { + return nccl_result != ncclSuccess; +} + +inline void throw_on_error(ncclResult_t stat, const std::string& msg) { #ifndef REPLACE_ENFORCE_GLOG - throw std::runtime_error(platform::dynload::ncclGetErrorString(stat) + - string::Sprintf(args...)); + throw std::runtime_error(platform::dynload::ncclGetErrorString(stat) + msg); #else - LOG(FATAL) << platform::dynload::ncclGetErrorString(stat) - << string::Sprintf(args...); + LOG(FATAL) << platform::dynload::ncclGetErrorString(stat) << msg; #endif - } } #endif // __APPLE__ and windows #endif // PADDLE_WITH_CUDA -template -inline void throw_on_error(T e) { - throw_on_error(e, ""); -} - -#define PADDLE_THROW(...) \ - throw ::paddle::platform::EnforceNotMet(__FILE__, __LINE__, __VA_ARGS__) - -#define __PADDLE_THROW_ERROR_I(_, _9, _8, _7, _6, _5, _4, _3, _2, X_, ...) X_; - -#define __THROW_ON_ERROR_ONE_ARG(COND, ARG) \ - ::paddle::platform::throw_on_error(COND, ::paddle::string::Sprintf(ARG)); - -#ifdef _WIN32 -#define __PADDLE_THROW_ON_ERROR(COND, ...) \ - __THROW_ON_ERROR_ONE_ARG(COND, __VA_ARGS__) -#else // _WIN32 -#define __PADDLE_THROW_ON_ERROR(COND, ...) \ - __PADDLE_THROW_ERROR_I( \ - __VA_ARGS__, ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - __THROW_ON_ERROR_ONE_ARG(COND, __VA_ARGS__)) -#endif // _WIN32 - -#define __PADDLE_UNARY_COMPARE(COND, ...) 
\ - do { \ - auto __cond = COND; \ - if (UNLIKELY(::paddle::platform::is_error(__cond))) { \ - __PADDLE_THROW_ON_ERROR(__cond, __VA_ARGS__); \ - } \ +#define PADDLE_THROW(...) \ + throw ::paddle::platform::EnforceNotMet( \ + ::paddle::string::Sprintf(__VA_ARGS__), __FILE__, __LINE__) + +#define PADDLE_ENFORCE(COND, ...) \ + do { \ + auto __cond__ = (COND); \ + if (UNLIKELY(::paddle::platform::is_error(__cond__))) { \ + try { \ + ::paddle::platform::throw_on_error( \ + __cond__, ::paddle::string::Sprintf(__VA_ARGS__)); \ + } catch (...) { \ + throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ + __FILE__, __LINE__); \ + } \ + } \ } while (0) -#ifndef REPLACE_ENFORCE_GLOG -#define __PADDLE_ENFORCE_I(COND, ...) \ - do { \ - try { \ - __PADDLE_UNARY_COMPARE(COND, __VA_ARGS__); \ - } catch (...) { \ - throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ - __FILE__, __LINE__); \ - } \ - } while (0) - -#else -#define __PADDLE_ENFORCE_I(COND, ...) __PADDLE_UNARY_COMPARE(COND, __VA_ARGS__); -#endif // REPLACE_ENFORCE_GLOG - -#define __PADDLE_ENFORCE(__args) __PADDLE_ENFORCE_I __args -#define PADDLE_ENFORCE(...) 
__PADDLE_ENFORCE((__VA_ARGS__)) - #define PADDLE_THROW_EOF() \ do { \ throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \ diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index 8df8e32098697540f02d488c873f5ae7fb29828e..6ae21ee8294bedc388f837aad3e20a2b9aca98a2 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -64,7 +64,7 @@ class NCCLGroupGuard { } inline ~NCCLGroupGuard() { - CHECK_EQ(dynload::ncclGroupEnd(), ncclSuccess); + PADDLE_ENFORCE(dynload::ncclGroupEnd()); NCCLMutex().unlock(); } }; diff --git a/paddle/fluid/platform/ngraph_helper.h b/paddle/fluid/platform/ngraph_helper.h index c5b65d6636945b85603c07aeae0290ef9cadb396..b84315995a9d8a65668f57eef67f6dab8c20f9b3 100644 --- a/paddle/fluid/platform/ngraph_helper.h +++ b/paddle/fluid/platform/ngraph_helper.h @@ -43,13 +43,14 @@ std::shared_ptr NgReshaper(std::shared_ptr input, std::shared_ptr GetNode( const std::shared_ptr& op, - const std::string prm, const paddle::framework::VariableNameMap& var_map, + const std::string name, const paddle::framework::VariableNameMap& var_map, std::shared_ptr< std::unordered_map>> ngb_node_map) { - auto& var_names = var_map.at(prm); + auto& var_names = var_map.at(name); PADDLE_ENFORCE_EQ(var_names.size(), 1, - "op %s prm %s expects one associated var", op->Type(), prm); + "op %s name %s expects one associated var", op->Type(), + name); if (ngb_node_map->find(var_names[0]) != ngb_node_map->end()) { return (*ngb_node_map)[var_names[0]]; } else { @@ -59,43 +60,53 @@ std::shared_ptr GetNode( std::shared_ptr GetInputNode( const std::shared_ptr& op, - const std::string prm, + const std::string name, std::shared_ptr< std::unordered_map>> ngb_node_map) { - return GetNode(op, prm, op->Inputs(), ngb_node_map); + return GetNode(op, name, op->Inputs(), ngb_node_map); } std::shared_ptr GetOutputNode( const std::shared_ptr& op, - const std::string prm, + const std::string name, 
std::shared_ptr< std::unordered_map>> ngb_node_map) { - return GetNode(op, prm, op->Outputs(), ngb_node_map); + return GetNode(op, name, op->Outputs(), ngb_node_map); } void SetOutputNode( const std::shared_ptr& op, - const std::string prm, std::shared_ptr node, + const std::string name, std::shared_ptr node, std::shared_ptr< std::unordered_map>> ngb_node_map) { - auto& var_names = op->Outputs().at(prm); + auto& var_names = op->Outputs().at(name); if (var_names.size() == 1) { + /* */ + auto dummy_out = GetOutputNode(op, name, ngb_node_map); + if (dummy_out && dummy_out->get_shape() != node->get_shape()) { + node = NgReshaper(node, dummy_out->get_shape()); + } + if (dummy_out && + dummy_out->get_element_type() != node->get_element_type()) { + node = std::make_shared( + node, dummy_out->get_element_type()); + } (*ngb_node_map)[var_names[0]] = node; } else if (var_names.size() == 0) { (*ngb_node_map)[""] = node; } else { - PADDLE_THROW("prm %s has more than 1 var_names.", prm); + PADDLE_THROW("name %s has more than 1 var_names.", name); } } bool HasOutput(const std::shared_ptr& op, - const std::string prm) { + const std::string name) { auto& outputs = op->Outputs(); - if (outputs.find(prm) == outputs.end()) return false; - return outputs.at(prm).size() > 0; + if (outputs.find(name) == outputs.end()) return false; + return outputs.at(name).size() > 0; } inline void GetMidDims(const ngraph::Shape& x_shape, diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 803ea6b26087884ad79c6bf80238953a012eaddc..4ac5b83c56b114f4e3e4c78710716adc636ebe1d 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -26,5 +26,5 @@ if(WITH_PYTHON) get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) target_link_libraries(paddle_pybind ${os_dependency_modules}) - cc_test(tensor_py_test SRCS tensor_py_test.cc DEPS python) + cc_test(tensor_py_test SRCS tensor_py_test.cc DEPS python pybind) 
endif(WITH_PYTHON) diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index e05667d2c7e9ce5c64cfacee4919cd36d7383c0c..39e47be606c07ed216c9fe2ff8fa75552b8b7c76 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -33,7 +33,6 @@ using paddle::PaddlePredictor; using paddle::NativeConfig; using paddle::NativePaddlePredictor; using paddle::AnalysisPredictor; -using paddle::contrib::AnalysisConfig; static void BindPaddleDType(py::module *m); static void BindPaddleBuf(py::module *m); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 97e5bbaaccaf7c702a324abd708a314c72ece004..6549229e05de5f2a809b56775d9788bbf8e5c1ae 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -37,6 +37,7 @@ limitations under the License. */ #include "paddle/fluid/framework/version.h" #include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h" +#include "paddle/fluid/memory/allocation/legacy_allocator.h" #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/py_func_op.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" @@ -127,6 +128,13 @@ PYBIND11_MODULE(core, m) { m.add_object("_cleanup", py::capsule([]() { ScopePool::Instance().Clear(); })); + m.def("get_mem_usage", [](int device) { + return memory::allocation::GPUMemMonitor.GetMemUsage(device); + }); + + m.def("print_mem_usage", + []() { return memory::allocation::GPUMemMonitor.PrintMemUsage(); }); + py::class_(m, "VarBase", R"DOC()DOC") // .def(py::init<>()) .def(py::init(), py::arg("stop_gradient") = false) @@ -1088,6 +1096,10 @@ All parameter, weight, gradient are variables in Paddle. 
"memory_early_delete", [](const BuildStrategy &self) { return self.memory_early_delete_; }, [](BuildStrategy &self, bool b) { self.memory_early_delete_ = b; }) + .def_property( + "enable_inplace", + [](const BuildStrategy &self) { return self.enable_inplace_; }, + [](BuildStrategy &self, bool b) { self.enable_inplace_ = b; }) .def("_finalize_strategy_and_create_passes", [](BuildStrategy &self) -> std::shared_ptr { return self.CreatePassesFromStrategy(true); diff --git a/paddle/fluid/string/printf.h b/paddle/fluid/string/printf.h index 0b94b60018aac3a61edfda4d7ecb762e9fe70673..16bb3771f2e9bcc07028ef2039fed8691f9aab97 100644 --- a/paddle/fluid/string/printf.h +++ b/paddle/fluid/string/printf.h @@ -84,6 +84,8 @@ void Fprintf(std::ostream& out, const char* fmt, const Args&... args) { tinyformat::vformat(out, fmt, tinyformat::makeFormatList(args...)); } +inline std::string Sprintf() { return ""; } + template std::string Sprintf(const Args&... args) { std::ostringstream oss; diff --git a/paddle/scripts/fast_install.sh b/paddle/scripts/fast_install.sh new file mode 100644 index 0000000000000000000000000000000000000000..b960d0f00a26196c827053c41a3b35b97e7cdb07 --- /dev/null +++ b/paddle/scripts/fast_install.sh @@ -0,0 +1,923 @@ +#!/bin/bash + +path='http://paddlepaddle.org/download?url=' +#release_version=`curl -s https://pypi.org/project/paddlepaddle/|grep -E "/project/paddlepaddle/"|grep "release"|awk -F '/' '{print $(NF-1)}'|head -1` +release_version=1.2.0 +python_list=( +"27" +"35" +"36" +"37" +) + + +function use_cpu(){ + while true + do + read -p "是否安装CPU版本的PaddlePaddle?(y/n)" cpu_option + cpu_option=`echo $cpu_option | tr 'A-Z' 'a-z'` + if [[ "$cpu_option" == "" || "$cpu_option" == "n" ]];then + echo "退出安装中..." + exit + else + GPU='cpu' + echo "将为您安装CPU版本的PaddlePaddle" + break + fi + done +} + +function checkLinuxCUDNN(){ + echo + read -n1 -p "请按回车键进行下一步..." 
+ echo + while true + do + version_file='/usr/local/cuda/include/cudnn.h' + if [ -f "$version_file" ];then + CUDNN=`cat $version_file | grep CUDNN_MAJOR |awk 'NR==1{print $NF}'` + fi + if [ "$CUDNN" == "" ];then + version_file=`sudo find /usr -name "cudnn.h"|head -1` + if [ "$version_file" != "" ];then + CUDNN=`cat ${version_file} | grep CUDNN_MAJOR -A 2|awk 'NR==1{print $NF}'` + else + echo "检测结果:未在常规路径下找到cuda/include/cudnn.h文件" + while true + do + read -p "请核实cudnn.h位置,并在此输入路径(请注意,路径需要输入到“cudnn.h”这一级):" cudnn_version + echo + if [ "$cudnn_version" == "" ] || [ ! -f "$cudnn_version" ];then + read -p "仍未找到cuDNN,输入y将安装CPU版本的PaddlePaddle,输入n可重新录入cuDNN路径,请输入(y/n)" cpu_option + echo + cpu_option=`echo $cpu_option | tr 'A-Z' 'a-z'` + if [ "$cpu_option" == "y" -o "$cpu_option" == "" ];then + GPU='cpu' + break + else + echo "请重新输入" + echo + fi + else + CUDNN=`cat $cudnn_version | grep CUDNN_MAJOR |awk 'NR==1{print $NF}'` + echo "检测结果:找到cudnn.h" + break + fi + done + if [ "$GPU" == "cpu" ];then + break + fi + fi + fi + if [ "$CUDA" == "9" -a "$CUDNN" != "7" ];then + echo + echo "目前CUDA9下仅支持cuDNN7,暂不支持您机器上的CUDNN${CUDNN}。您可以访问NVIDIA官网下载适合版本的CUDNN,请ctrl+c退出安装进程。按回车键将为您安装CPU版本的PaddlePaddle" + echo + use_cpu() + if [ "$GPU"=="cpu" ];then + break + fi + fi + + if [ "$CUDNN" == 5 ] || [ "$CUDNN" == 7 ];then + echo + echo "您的CUDNN版本是: CUDNN$CUDNN" + break + else + echo + read -n1 -p "目前支持的CUDNN版本为5和7,暂不支持您机器上的CUDNN${CUDNN},将为您安装CPU版本的PaddlePaddle,请按回车键开始安装" + echo + use_cpu + if [ "$GPU"=="cpu" ];then + break + fi + fi + done +} + +function checkLinuxCUDA(){ + while true + do + CUDA=`echo ${CUDA_VERSION}|awk -F "[ .]" '{print $1}'` + if [ "$CUDA" == "" ];then + if [ -f "/usr/local/cuda/version.txt" ];then + CUDA=`cat /usr/local/cuda/version.txt | grep 'CUDA Version'|awk -F '[ .]' '{print $3}'` + tmp_cuda=$CUDA + fi + if [ -f "/usr/local/cuda8/version.txt" ];then + CUDA=`cat /usr/local/cuda8/version.txt | grep 'CUDA Version'|awk -F '[ .]' '{print $3}'` + tmp_cuda8=$CUDA + fi + if [ 
-f "/usr/local/cuda9/version.txt" ];then + CUDA=`cat /usr/local/cuda9/version.txt | grep 'CUDA Version'|awk -F '[ .]' '{print $3}'` + tmp_cuda9=$CUDA + fi + fi + + if [ "$tmp_cuda" != "" ];then + echo "检测结果:找到CUDA $tmp_cuda" + fi + if [ "$tmp_cudai8" != "" ];then + echo "检测结果:找到CUDA $tmp_cuda8" + fi + if [ "$tmp_cuda9" != "" ];then + echo "检测结果:找到CUDA $tmp_cuda9" + fi + + if [ "$CUDA" == "" ];then + echo "检测结果:没有在常规路径下找到cuda/version.txt文件" + while true + do + read -p "请输入cuda/version.txt的路径:" cuda_version + if [ "$cuda_version" == "" || ! -f "$cuda_version" ];then + read -p "仍未找到CUDA,输入y将安装CPU版本的PaddlePaddle,输入n可重新录入CUDA路径,请输入(y/n)" cpu_option + cpu_option=`echo $cpu_option | tr 'A-Z' 'a-z'` + if [ "$cpu_option" == "y" || "$cpu_option" == "" ];then + GPU='cpu' + break + else + echo "重新输入..." + fi + else + CUDA=`cat $cuda_version | grep 'CUDA Version'|awk -F '[ .]' '{print $3}'` + if [ "$CUDA" == "" ];then + echo "未能在version.txt中找到CUDA相关信息" + else + break + fi + fi + done + if [ "$GPU" == "cpu" ];then + break + fi + fi + + if [ "$CUDA" == "8" ] || [ "$CUDA" == "9" ];then + echo "您的CUDA版本是${CUDA}" + break + else + echo "目前支持CUDA8/9,暂不支持您的CUDA${CUDA},将为您安装CPU版本的PaddlePaddle" + echo + use_cpu + fi + + if [ "$GPU" == "cpu" ];then + break + fi + done +} + +function checkLinuxMathLibrary(){ + while true + do + if [ "$AVX" == "" ];then + echo "正在检测您环境中是否存在AVX指令集..." + echo + echo "检测结果:您电脑上没有AVX指令集,目前针对无AVX指令集的环境,我们仅提供支持mkl数学库的PaddlePaddle,将为您安装此版本的PaddlePaddle" + math='mkl' + break + elif [ "$GPU" == "gpu" ];then + math='mkl' + echo "检测到您的机器上配备GPU,推荐您使用mkl数学库" + break + else + read -p "请输入您希望使用的数学库: + 1:openblas 一个高性能多核 BLAS 库 + 2:mkl(推荐) 英特尔数学核心函数库 + => 请输入数字1或2。如输入其他字符或直接回车,将会默认选择【 2. 
mkl 】 。请在这里输入并回车:" math + if [ "$math" == "" ];then + math="mkl" + echo "您选择了数字【2】" + break + fi + if [ "$math" == "1" ];then + math=openblas + echo "您选择了数字【1】" + break + elif [ "$math" == "2" ];then + math=mkl + echo "您选择了数字【2】" + break + fi + echo "输入错误,请再次输入" + fi + done +} + +function checkLinuxPaddleVersion(){ + read -n1 -p "请按回车键继续..." + while true + do + read -p " + 1. 开发版:对应Github上develop分支,如您需要开发、或希望使用PaddlePaddle最新功能,请选用此版本 + 2. 稳定版(推荐):如您无特殊开发需求,建议使用此版本,目前最新的版本号为 ${release_version} + => 请输入数字1或2。如输入其他字符或直接回车,将会默认选择【 2. 稳定版 】 。请在这里输入并回车:" paddle_version + if [ "$paddle_version" == "" ];then + # An empty answer selects the stable release; store "2" because PipLinuxInstall compares paddle_version against "2". + paddle_version="2" + echo "您选择了数字【2】,为您安装release-${release_version}" + break + fi + if [ "$paddle_version" == "1" ];then + echo "您选择了数字【1】,将为您安装开发版" + break + elif [ "$paddle_version" == "2" ];then + echo "您选择了数字【2】,为您安装release-${release_version}" + break + fi + echo "输入错误,请再次输入" + done +} + +function checkLinuxPip(){ + while true + do + echo "请输入您要使用的pip目录(您可以另起终端,并使用which pip来查看):" + read -p "" pip_path + if [ "$pip_path" == "" -o ! -f "$pip_path" ];then + echo "检测结果:pip不存在,请重新输入" + continue + fi + python_version=`$pip_path --version|awk -F "[ |)]" '{print $6}'|sed 's#\.##g'` + if [ "$python_version" == "27" ];then + uncode=`python -c "import pip._internal;print(pip._internal.pep425tags.get_supported())"|grep "cp27mu"` + if [[ "$uncode" == "" ]];then + uncode= + else + uncode=u + fi + fi + if [ "$python_version" == "" ];then + echo "检测结果:pip不存在,请重新输入" + else + version_list=`echo "${python_list[@]}" | grep "$python_version" ` + if [ "$version_list" != "" ];then + echo "检测结果:找到python${python_version}版本" + break + else + echo "检测结果:找不到可用的 pip, 我们只支持Python27/35/36/37及其对应的pip, 请重新输入, 或使用ctrl + c退出 " + fi + fi + done +} + +function checkLinuxAVX(){ + while true + do + if [[ "$AVX" != "" ]];then + AVX="avx" + break + else + if [ "$CUDA" == "8" -a "$CUDNN" == "7" ] || [ "$GPU" == "cpu" ];then + AVX="noavx" + break + else + echo "Step 6. 
检测是否有avx" + echo + echo "检测结果:未能找到avx,我们仅提供CPU版本或配置为CUDA8 cuDNN7的GPU版本的安装包" + break + fi + fi + done +} + +# Download the wheel matching the selected version / GPU / AVX / math library and install it with $pip_path. +function PipLinuxInstall(){ + wheel_cpu_release="http://paddle-wheel.bj.bcebos.com/${release_version}-${GPU}-${AVX}-${math}/paddlepaddle-${release_version}-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl" + wheel_gpu_release="http://paddle-wheel.bj.bcebos.com/${release_version}-gpu-cuda${CUDA}-cudnn${CUDNN}-${AVX}-${math}/paddlepaddle_gpu-${release_version}.post${CUDA}${CUDNN}-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl" + wheel_gpu_release_noavx="http://paddle-wheel.bj.bcebos.com/${release_version}-gpu-cuda${CUDA}-cudnn${CUDNN}-${AVX}-${math}/paddlepaddle_gpu-${release_version}-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl" + wheel_cpu_develop="http://paddle-wheel.bj.bcebos.com/latest-cpu-${AVX}-${math}/paddlepaddle-latest-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl" + wheel_gpu_develop="http://paddle-wheel.bj.bcebos.com/latest-gpu-cuda${CUDA}-cudnn${CUDNN}-${AVX}-${math}/paddlepaddle_gpu-latest-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl" + + + if [[ "$paddle_version" == "2" ]];then + if [[ "$GPU" == "gpu" ]];then + if [[ ${AVX} == "avx" ]];then + rm -rf `echo $wheel_gpu_release|awk -F '/' '{print $NF}'` + wget -q $wheel_gpu_release + if [ "$?" == "0" ];then + $pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_release + else + echo "paddlepaddle whl包下载失败" + exit 1 + fi + else + # No AVX: fetch the GPU wheel published without AVX (wheel_gpu_release_noavx). + rm -rf `echo $wheel_gpu_release_noavx|awk -F '/' '{print $NF}'` + wget -q $wheel_gpu_release_noavx + if [ "$?" == "0" ];then + $pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_release_noavx + else + echo "paddlepaddle whl包下载失败" + exit 1 + fi + fi + else + rm -rf `echo $wheel_cpu_release|awk -F '/' '{print $NF}'` + wget -q $wheel_cpu_release + if [ "$?" 
== "0" ];then + $pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_cpu_release + else + echo "paddlepaddle whl包下载失败" + exit 1 + fi + fi + else + if [[ "$GPU" == "gpu" ]];then + rm -rf `echo $wheel_gpu_develop|awk -F '/' '{print $NF}'` + wget -q $wheel_gpu_develop + if [ "$?" == "0" ];then + $pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_develop + else + echo "paddlepaddle whl包下载失败" + exit 1 + fi + else + rm -rf `echo $wheel_cpu_develop|awk -F '/' '{print $NF}'` + wget -q $wheel_cpu_develop + if [ "$?" == "0" ];then + $pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_cpu_develop + else + echo "paddlepaddle whl包下载失败" + exit 1 + fi + fi + fi +} + + +function checkLinuxGPU(){ + read -n1 -p "即将检测您的机器是否含GPU,请按回车键继续..." + echo + AVX=`cat /proc/cpuinfo |grep avx|tail -1|grep avx` + which nvidia-smi >/dev/null 2>&1 + if [ "$?" != "0" ];then + GPU='cpu' + echo "未在机器上找到GPU,或PaddlePaddle暂不支持此型号的GPU" + else + GPU='gpu' + echo "已在您的机器上找到GPU,即将确认CUDA和CUDNN版本..." 
+ echo + fi + if [ "$GPU" == 'gpu' ];then + checkLinuxCUDA + checkLinuxCUDNN + fi +} + +function linux(){ +gpu_list=( +"GeForce 410M" +"GeForce 610M" +"GeForce 705M" +"GeForce 710M" +"GeForce 800M" +"GeForce 820M" +"GeForce 830M" +"GeForce 840M" +"GeForce 910M" +"GeForce 920M" +"GeForce 930M" +"GeForce 940M" +"GeForce GT 415M" +"GeForce GT 420M" +"GeForce GT 430" +"GeForce GT 435M" +"GeForce GT 440" +"GeForce GT 445M" +"GeForce GT 520" +"GeForce GT 520M" +"GeForce GT 520MX" +"GeForce GT 525M" +"GeForce GT 540M" +"GeForce GT 550M" +"GeForce GT 555M" +"GeForce GT 610" +"GeForce GT 620" +"GeForce GT 620M" +"GeForce GT 625M" +"GeForce GT 630" +"GeForce GT 630M" +"GeForce GT 635M" +"GeForce GT 640" +"GeForce GT 640 (GDDR5)" +"GeForce GT 640M" +"GeForce GT 640M LE" +"GeForce GT 645M" +"GeForce GT 650M" +"GeForce GT 705" +"GeForce GT 720" +"GeForce GT 720M" +"GeForce GT 730" +"GeForce GT 730M" +"GeForce GT 735M" +"GeForce GT 740" +"GeForce GT 740M" +"GeForce GT 745M" +"GeForce GT 750M" +"GeForce GTS 450" +"GeForce GTX 1050" +"GeForce GTX 1060" +"GeForce GTX 1070" +"GeForce GTX 1080" +"GeForce GTX 1080 Ti" +"GeForce GTX 460" +"GeForce GTX 460M" +"GeForce GTX 465" +"GeForce GTX 470" +"GeForce GTX 470M" +"GeForce GTX 480" +"GeForce GTX 480M" +"GeForce GTX 485M" +"GeForce GTX 550 Ti" +"GeForce GTX 560M" +"GeForce GTX 560 Ti" +"GeForce GTX 570" +"GeForce GTX 570M" +"GeForce GTX 580" +"GeForce GTX 580M" +"GeForce GTX 590" +"GeForce GTX 650" +"GeForce GTX 650 Ti" +"GeForce GTX 650 Ti BOOST" +"GeForce GTX 660" +"GeForce GTX 660M" +"GeForce GTX 660 Ti" +"GeForce GTX 670" +"GeForce GTX 670M" +"GeForce GTX 670MX" +"GeForce GTX 675M" +"GeForce GTX 675MX" +"GeForce GTX 680" +"GeForce GTX 680M" +"GeForce GTX 680MX" +"GeForce GTX 690" +"GeForce GTX 750" +"GeForce GTX 750 Ti" +"GeForce GTX 760" +"GeForce GTX 760M" +"GeForce GTX 765M" +"GeForce GTX 770" +"GeForce GTX 770M" +"GeForce GTX 780" +"GeForce GTX 780M" +"GeForce GTX 780 Ti" +"GeForce GTX 850M" +"GeForce GTX 860M" +"GeForce GTX 
870M" +"GeForce GTX 880M" +"GeForce GTX 950" +"GeForce GTX 950M" +"GeForce GTX 960" +"GeForce GTX 960M" +"GeForce GTX 965M" +"GeForce GTX 970" +"GeForce GTX 970M" +"GeForce GTX 980" +"GeForce GTX 980M" +"GeForce GTX 980 Ti" +"GeForce GTX TITAN" +"GeForce GTX TITAN Black" +"GeForce GTX TITAN X" +"GeForce GTX TITAN Z" +"Jetson TK1" +"Jetson TX1" +"Jetson TX2" +"Mobile Products" +"NVIDIA NVS 310" +"NVIDIA NVS 315" +"NVIDIA NVS 510" +"NVIDIA NVS 810" +"NVIDIA TITAN V" +"NVIDIA TITAN X" +"NVIDIA TITAN Xp" +"NVS 4200M" +"NVS 5200M" +"NVS 5400M" +"Quadro 410" +"Quadro GP100" +"Quadro K1100M" +"Quadro K1200" +"Quadro K2000" +"Quadro K2000D" +"Quadro K2100M" +"Quadro K2200" +"Quadro K2200M" +"Quadro K3100M" +"Quadro K4000" +"Quadro K4100M" +"Quadro K420" +"Quadro K4200" +"Quadro K4200M" +"Quadro K5000" +"Quadro K500M" +"Quadro K5100M" +"Quadro K510M" +"Quadro K5200" +"Quadro K5200M" +"Quadro K600" +"Quadro K6000" +"Quadro K6000M" +"Quadro K610M" +"Quadro K620" +"Quadro K620M" +"Quadro M1000M" +"Quadro M1200" +"Quadro M2000" +"Quadro M2000M" +"Quadro M2200" +"Quadro M3000M" +"Quadro M4000" +"Quadro M4000M" +"Quadro M5000" +"Quadro M5000M" +"Quadro M500M" +"Quadro M520" +"Quadro M5500M" +"Quadro M6000" +"Quadro M6000 24GB" +"Quadro M600M" +"Quadro M620" +"Quadro Mobile Products" +"Quadro P1000" +"Quadro P2000" +"Quadro P3000" +"Quadro P400" +"Quadro P4000" +"Quadro P5000" +"Quadro P600" +"Quadro P6000" +"Quadro Plex 7000" +"Tegra K1" +"Tegra X1" +"Tesla C2050/C2070" +"Tesla C2075" +"Tesla Data Center Products" +"Tesla K10" +"Tesla K20" +"Tesla K40" +"Tesla K80" +"Tesla M40" +"Tesla M60" +"Tesla P100" +"Tesla P4" +"Tesla P40" +"Tesla V100") + + echo "Step 2. 检测GPU型号和CUDA/cuDNN版本" + echo + checkLinuxGPU + echo + echo "Step 3. 检测数学库" + echo + checkLinuxMathLibrary + echo + echo "Step 4. 选择要安装的PaddlePaddle版本" + echo + checkLinuxPaddleVersion + echo + echo "Step 5. 检测pip版本" + echo + checkLinuxPip + echo + checkLinuxAVX + echo "*********************2. 
开始安装*****************************" + PipLinuxInstall +} + +function checkMacPython2(){ + while true + do + read -p " + => 未能在常规路径下找到Python2,请使用ctrl+c命令退出安装程序,并使用brew或pypi.org下载安装Python2(注意Python版本不能低于2.7.15) + 如希望自定义Python路径,请输入路径:" python_root + echo + python_version=`$python_root --version 2>&1 1>&1` + if [ $? == "0" ];then + : + else + python_version="" + fi + check_python=`echo $python_version | grep "Python 2"` + if [ "$python_version" == "" ] || [ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ] ;then + python_version="" + elif [ -n "$check_python" ];then + while true + do + read -p " + => 在您的环境中找到 $python_version, 确认使用此版本请输入y;如您希望自定义Python路径请输入n。请在这里输入(y/n)并回车: " use_python + echo + use_python=`echo $use_python | tr 'A-Z' 'a-z'` + if [ "$use_python" == "y" ]||[ "$use_python" == "" ];then + use_python="y" + break + elif [ "$use_python" == "n" ];then + python_root="" + break + else + echo "输入错误,请重新输入(y/n)" + fi + done + if [ "$use_python" == "y" ];then + break + fi + else + echo "您输入Python的不是Python2" + python_version="" + fi + done +} + +function checkMacPython3(){ + while true + do + read -p " + => 未能在常规路径下找到Python3,请使用ctrl+c命令退出安装程序,并使用brew或pypi.org下载Python3 + 如希望自定义Python路径,请输入路径:" python_root + python_version=`$python_root --version 2>&1 1>&1` + if [ $? 
== "0" ];then + : + else + python_version="" + fi + check_python=`echo $python_version | grep "Python 3"` + if [ "$python_version" == "" ] || [ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ] ;then + python_version="" + elif [ -n "$check_python" ] ;then + while true + do + read -p " + => 在您的环境中找到 $python_version, 确认使用此版本请输入y;如您希望自定义Python路径请输入n。请在这里输入(y/n)并回车: " use_python + echo + use_python=`echo $use_python | tr 'A-Z' 'a-z'` + if [ "$use_python" == "y" ]||[ "$use_python" == "" ];then + use_python="y" + break + elif [ "$use_python" == "n" ];then + python_root="" + break + else + echo "输入错误,请重新输入(y/n)" + fi + done + if [ "$use_python" == "y" ];then + break + fi + else + echo "您输入Python的不是Python3" + python_version="" + fi + done +} + +function checkMacPaddleVersion(){ + while true + do + read -n1 -p "Step 2. 选择PaddlePaddle的版本,请按回车键继续..." + echo + read -p " + 1. 开发版:对应Github上develop分支,如您需要开发、或希望使用PaddlePaddle最新功能,请选用此版本 + 2. 稳定版(推荐):如您无特殊开发需求,建议使用此版本,目前最新的版本号为 ${release_version} + + => 请输入数字1或2。如输入其他字符或直接回车,将会默认选择【 2. 稳定版 】 。请在这里输入并回车:" paddle_version + if [ "$paddle_version" == "1" ]||[ "$paddle_version" == "2" ];then + echo + echo "您选择了数字【"$paddle_version" 】" + echo + break + else + paddle_version="2" + echo + echo "您选择了数字【2】" + echo + break + fi + done +} + +function checkMacPythonVersion(){ + while true + do + read -n1 -p "Step 3. 选择Python版本,请按回车键继续..." + read -p " + 2. 使用python 2.x + 3. 使用python 3.x + + => 请输入数字2或3。如输入其他字符或直接回车,将会默认使用【Python 2 】。请在这里输入并回车:" python_V + echo + if [ "$python_V" == "" ];then + python_V="2" + fi + echo "您选择了数字【"$python_V"】,正在寻找符合您要求的Python版本,请按回车键继续..." + echo + if [ "$python_V" == "2" ];then + python_root=`which python2.7` + if [ "$python_root" == "" ];then + python_root=`which python` + fi + python_version=`$python_root --version 2>&1 1>&1` + if [ $? 
== "0" ];then + : + else + python_version="" + fi + if [ "$python_root" == "" ]||[ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ]||[ "$python_root" == "/usr/bin/python2.7" -a "$python_version" == "Python 2.7.10" ];then + checkMacPython2 + fi + while true + do + read -p " + => 在您的环境中找到 $python_version, 确认使用此版本请输入y;如您希望自定义Python路径请输入n。请在这里输入(y/n)并回车:" use_python + echo + use_python=`echo $use_python | tr 'A-Z' 'a-z'` + if [ "$use_python" == "y" ]||[ "$use_python" == "" ];then + break + elif [ "$use_python" == "n" ];then + python_root="" + checkMacPython2 + break + else + echo "输入错误,请重新输入(y/n)" + fi + done + + elif [ "$python_V" == "3" ];then + python_root=`which python3` + python_version=`$python_root --version 2>&1 1>&1` + if [ $? == "0" ];then + : + else + python_version="" + fi + if [ "$python_root" == "" ]||[ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ];then + checkMacPython3 + fi + while true + do + read -p " + => 在您的环境中找到 $python_version, 确认使用此版本请输入y;如您希望自定义Python路径请输入n。请在这里输入(y/n)并回车:" use_python + echo + use_python=`echo $use_python | tr 'A-Z' 'a-z'` + if [ "$use_python" == "y" ]||[ "$use_python" == "" ];then + break + elif [ "$use_python" == "n" ];then + checkMacPython3 + break + else + echo "输入错误,请重新输入(y/n)" + fi + done + else + : + fi + + + if [ "$python_V" == "2" ]||[ "$python_V" == "3" ];then + python_brief_version=`$python_root -m pip -V |awk -F "[ |)]" '{print $6}'|sed 's#\.##g'` + if [[ $python_brief_version == "27" ]];then + uncode=`python -c "import pip._internal;print(pip._internal.pep425tags.get_supported())"|grep "cp27"` + if [[ $uncode == "" ]];then + uncode="mu" + else + uncode="m" + fi + fi + version_list=`echo "${python_list[@]}" | grep "$python_brief_version" ` + if [ "$version_list" != "" ];then + break + else + echo "未找到可用的pip或pip3。PaddlePaddle目前支持:Python2.7/3.5/3.6/3.7及其对应的pip, 请重新输入,或使用ctrl + c退出" + fi + else + echo "输入错误,请重新输入" + fi + done +} + +function checkMacAVX(){ + 
read -n1 -p "Step 4. 检测您的Mac是否支持AVX指令集,请按回车键继续..." + echo + if [[ $AVX != "" ]];then + AVX="avx" + echo "检测结果:支持" + else + read -n1 -p "检测结果:不支持。非常抱歉,PaddlePaddle在Mac系统暂不提供no_avx类型的安装包,您可以选择在Linux系统中安装no_avx版的PaddlePaddle, 请按回车键退出..." + exit + fi + echo +} + +function checkMacGPU(){ + read -n1 -p "Step 5. 选择CPU/GPU版本,请按回车键继续..." + echo + if [[ $GPU != "" ]];then + echo "MacOS环境下,暂未提供GPU版本的PaddlePaddle安装包,将为您安装CPU版本的PaddlePaddle" + else + echo "MacOS环境下,暂未提供GPU版本的PaddlePaddle安装包,将为您安装CPU版本的PaddlePaddle" + GPU=cpu + fi + echo +} + +function macos() { + path='http://paddlepaddle.org/download?url=' + AVX=`sysctl -a | grep cpu | grep AVX1.0 | tail -1 | grep AVX` + + while true + do + checkMacPaddleVersion + checkMacPythonVersion + checkMacAVX + checkMacGPU + + + echo "*********************2. 开始安装*****************************" + echo + read -n1 -p "即将为您下载并安装PaddlePaddle,请按回车键继续..." + echo + if [[ $paddle_version == "2" ]];then + $python_root -m pip install paddlepaddle + if [ $? == "0" ];then + echo "安装成功,可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器" + break + else + rm $whl_cpu_release + echo "未能正常安装PaddlePaddle,请尝试更换您输入的python路径,或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用" + echo"" + echo "==========================================================================================" + echo"" + exit 1 + fi + else + if [ -f $whl_cpu_develop ];then + $python_root -m pip install $whl_cpu_develop + if [ $? == "0" ];then + rm -rf $whl_cpu_develop + echo "安装成功!小提示:可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器" + break + else + echo "未能正常安装PaddlePaddle,请尝试更换您输入的python路径,或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用" + echo"" + echo "==========================================================================================" + echo"" + exit 1 + fi + else + wget ${path}$whl_cpu_develop -O $whl_cpu_develop + if [ $? == "0" ];then + $python_root -m pip install $whl_cpu_develop + if [ $? 
== "0" ];then + # Remove the wheel file that wget saved as $whl_cpu_develop once it is installed. + rm $whl_cpu_develop + echo "安装成功,可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器" + break + else + rm $whl_cpu_release + echo "未能正常安装PaddlePaddle,请尝试更换您输入的python路径,或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用" + echo"" + echo "==========================================================================================" + echo"" + exit 1 + fi + else + rm $whl_cpu_develop + echo "未能正常安装PaddlePaddle,请检查您的网络 或者确认您是否安装有 wget,或者ctrl + c退出后反馈至https://github.com/PaddlePaddle/Paddle/issues" + echo"" + echo "==========================================================================================" + echo"" + exit 1 + fi + fi + fi + done +} + +function main() { + echo "*********************************" + echo "欢迎使用PaddlePaddle快速安装脚本" + echo "*********************************" + echo + echo "如果您在安装过程中遇到任何问题,请在https://github.com/PaddlePaddle/Paddle/issues反馈,我们的工作人员将会帮您答疑解惑" + echo + echo "本安装包将帮助您在Linux或Mac系统下安装PaddlePaddle,包括 1)安装前的准备和 2)开始安装 两部分" + echo + read -n1 -p "请按回车键进行下一步..." + echo + echo + echo "*********************1. 安装前的准备*****************************" + echo + echo "Step 1. 正在检测您的操作系统信息..." 
+ echo + SYSTEM=`uname -s` + if [ "$SYSTEM" == "Darwin" ];then + echo "您的系统为:MAC OSX" + echo + macos + else + echo "您的系统为:Linux" + echo + OS=`cat /etc/issue|awk 'NR==1 {print $1}'` + if [ $OS == "\S" ] || [ "$OS" == "CentOS" ] || [ $OS == "Ubuntu" ];then + linux + else + echo "您的系统不在本安装包的支持范围,如您需要在windows环境下安装PaddlePaddle,请您参考PaddlePaddle官网的windows安装文档" + fi + fi +} +main diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 59e695e6fcb66cbaed1bcc9e861df81b5f73c1ed..90b8fd1a0aab159eb1a829d67485c845182d295b 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -54,7 +54,7 @@ ELSE(WIN32) DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) ENDIF() -set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS}) +set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS} ${external_project_dependencies}) add_custom_target(paddle_python ALL DEPENDS ${paddle_python_deps}) set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 564882bd2a23437665777c646e6e399cdffae723..396f36e188b27fe450cc19b3b8ccf967daf1456c 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -158,7 +158,8 @@ def __bootstrap__(): 'enable_cublas_tensor_op_math', 'conv_workspace_size_limit', 'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus', 'sync_nccl_allreduce', 'limit_of_tmp_allocation', - 'times_excess_than_required_tmp_allocation' + 'times_excess_than_required_tmp_allocation', + 'enable_inplace_whitelist' ] core.init_gflags([sys.argv[0]] + diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py index a35a4c59835e2a64a11ae156bed34d4b35696f73..ef0242942838fcca737a10fafbafa61bf520b532 100644 --- a/python/paddle/fluid/compiler.py +++ b/python/paddle/fluid/compiler.py @@ -174,6 +174,11 @@ class 
CompiledProgram(object): self._exec_strategy.num_threads = cpu_num * 2 trainers_endpoints = self._program._trainers_endpoints + + # FIXME(dzhwinter): enable_inplace should be after memory_optimize + # if turn on python memory optimize, turn off the inplace_pass. + self._build_strategy.enable_inplace = False if self._program._is_mem_optimized else True + if self._build_strategy.num_trainers > 1 and trainers_endpoints: assert self._build_strategy.num_trainers == len( trainers_endpoints), "num_trainers == len(end_points)" diff --git a/python/paddle/fluid/contrib/__init__.py b/python/paddle/fluid/contrib/__init__.py index 6127ca8a3eacd013dd258a02b9f3cc792b634137..870c57e54011361caae5265201d19f58830a87bc 100644 --- a/python/paddle/fluid/contrib/__init__.py +++ b/python/paddle/fluid/contrib/__init__.py @@ -22,6 +22,8 @@ from . import op_frequence from .op_frequence import * from . import quantize from .quantize import * +from . import int8_inference +from .int8_inference import * from . import reader from .reader import * from . import slim @@ -34,6 +36,7 @@ __all__ += decoder.__all__ __all__ += memory_usage_calc.__all__ __all__ += op_frequence.__all__ __all__ += quantize.__all__ +__all__ += int8_inference.__all__ __all__ += reader.__all__ __all__ += slim.__all__ __all__ += utils.__all__ diff --git a/python/paddle/fluid/contrib/int8_inference/README.md b/python/paddle/fluid/contrib/int8_inference/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a9691dad4494f5eacf427b2806b2393baa57dc1e --- /dev/null +++ b/python/paddle/fluid/contrib/int8_inference/README.md @@ -0,0 +1,72 @@ +# Offline INT8 Calibration Tool + +PaddlePaddle supports offline INT8 calibration to accelerate the inference speed. In this document, we provide the instructions on how to enable INT8 calibration and show the ResNet-50 and MobileNet-V1 results in accuracy. + +## 0. Prerequisite +You need to install at least PaddlePaddle-1.3 python package `pip install paddlepaddle==1.3`. 
+ +## 1. How to generate INT8 model +You can refer to the unit test in [test_calibration.py](../tests/test_calibration.py). Basically, there are three steps: +* Construct calibration object. + +```python +calibrator = int8_utility.Calibrator( # Step 1 + program=infer_program, # required, FP32 program + pretrained_model=model_path, # required, FP32 pretrained model + algo=algo, # required, calibration algorithm; default is max, the alternative is KL (Kullback–Leibler divergence) + exe=exe, # required, executor + output=int8_model, # required, INT8 model + feed_var_names=feed_dict, # required, feed dict + fetch_list=fetch_targets) # required, fetch targets +``` + +* Call the calibrator.sample_data() after executor run. +```python +_, acc1, _ = exe.run( + program, + feed={feed_dict[0]: image, + feed_dict[1]: label}, + fetch_list=fetch_targets) + +calibrator.sample_data() # Step 2 +``` + +* Call the calibrator.save_int8_model() after sampling over specified iterations (e.g., iterations = 50) +```python +calibrator.save_int8_model() # Step 3 +``` + +## 2. How to run INT8 model +You can load the INT8 model with the load_inference_model [API](https://github.com/PaddlePaddle/Paddle/blob/8b50ad80ff6934512d3959947ac1e71ea3fb9ea3/python/paddle/fluid/io.py#L991) and run INT8 inference in the same way as [FP32](https://github.com/PaddlePaddle/models/blob/develop/fluid/PaddleCV/object_detection/eval.py "FP32"). + +```python +[infer_program, feed_dict, + fetch_targets] = fluid.io.load_inference_model(model_path, exe) +``` + +## 3. Result +We provide the accuracy results measured on [Intel® Xeon® Gold 6148 Processor](https://ark.intel.com/products/120489/Intel-Xeon-Gold-6148-Processor-27-5M-Cache-2-40-GHz- "Intel® Xeon® Gold 6148 Processor") (also known as Intel® Xeon® Skylake6148). 
+ +| Model | Dataset | FP32 Accuracy | INT8 Accuracy | Accuracy Diff | +| ------------ | ------------ | ------------ | ------------ | ------------ | +| ResNet-50 | Small | 72.00% | 72.00% | 0.00% | +| MobileNet-V1 | Small | 62.00% | 62.00% | 0.00% | +| ResNet-50 | Full ImageNet Val | 76.63% | 76.17% | 0.46% | +| MobileNet-V1 | Full ImageNet Val | 70.78% | 70.49% | 0.29% | + +Please note that [Small](http://paddle-inference-dist.cdn.bcebos.com/int8/calibration_test_data.tar.gz "Small") is a subset of [full ImageNet validation dataset](http://www.image-net.org/challenges/LSVRC/2012/nnoupb/ILSVRC2012_img_val.tar "full ImageNet validation dataset"). + +Notes: +* The accuracy measurement requires the model with `label`. +* The INT8 theoretical speedup is ~1.33X on Intel® Xeon® Skylake Server (please refer to `This allows for 4x more input at the cost of 3x more instructions or 33.33% more compute` in [Reference](https://software.intel.com/en-us/articles/lower-numerical-precision-deep-learning-inference-and-training "Reference")). + +## 4. How to reproduce the results +* Small dataset +```bash +python python/paddle/fluid/contrib/tests/test_calibration.py +``` + +* Full dataset +```bash +DATASET=full python python/paddle/fluid/contrib/tests/test_calibration.py +``` diff --git a/python/paddle/fluid/contrib/int8_inference/__init__.py b/python/paddle/fluid/contrib/int8_inference/__init__.py index eca2dce114b069bf9b455d77ce670d73b5047fd2..45547201d598c809f7dcf3a1a09103ae5de3e4c6 100644 --- a/python/paddle/fluid/contrib/int8_inference/__init__.py +++ b/python/paddle/fluid/contrib/int8_inference/__init__.py @@ -11,3 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from __future__ import print_function + +from . 
import utility +from .utility import * + +__all__ = utility.__all__ diff --git a/python/paddle/fluid/contrib/int8_inference/utility.py b/python/paddle/fluid/contrib/int8_inference/utility.py index 40de038f28a83738e6e6cd8c77c0a9916ce68b4f..b35d9f2424ccf093f70e75b13e23f6c5ad59e859 100644 --- a/python/paddle/fluid/contrib/int8_inference/utility.py +++ b/python/paddle/fluid/contrib/int8_inference/utility.py @@ -11,11 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import paddle.fluid.core as core + +from paddle.fluid import core import numpy as np import math import os -import paddle.fluid as fluid +from paddle.fluid.executor import global_scope +from paddle.fluid import io + +__all__ = ['Calibrator'] class Calibrator(object): @@ -76,8 +80,7 @@ class Calibrator(object): ''' for i in self.sampling_program.list_vars(): if i.name in self.sampling_vars: - np_data = np.array(fluid.global_scope().find_var(i.name) - .get_tensor()) + np_data = np.array(global_scope().find_var(i.name).get_tensor()) if i.name not in self._sampling_data: self._sampling_data[i.name] = [] self._sampling_data[i.name].append(np_data) @@ -86,9 +89,9 @@ class Calibrator(object): ''' Save the quantized model to the disk. 
''' - fluid.io.save_inference_model(self.output, self.feed_var_names, - self.fetch_list, self.exe, - self.sampling_program) + io.save_inference_model(self.output, self.feed_var_names, + self.fetch_list, self.exe, + self.sampling_program) def __display_debug(self): if self.debug: diff --git a/python/paddle/fluid/contrib/tests/test_calibration.py b/python/paddle/fluid/contrib/tests/test_calibration.py index f07fefe7e097377a845193bb37b6e9aa42708948..424ea245a0f2dff0d437ace386f2e4e0fa6b517d 100644 --- a/python/paddle/fluid/contrib/tests/test_calibration.py +++ b/python/paddle/fluid/contrib/tests/test_calibration.py @@ -19,15 +19,12 @@ import sys import random import paddle import paddle.fluid as fluid -import argparse import functools import contextlib -import paddle.fluid.profiler as profiler from paddle.dataset.common import download from PIL import Image, ImageEnhance import math -sys.path.append('..') -import int8_inference.utility as int8_utility +import paddle.fluid.contrib.int8_inference.utility as int8_utility random.seed(0) np.random.seed(0) @@ -43,7 +40,7 @@ img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1)) img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1)) -# TODO(guomingz): Remove duplicated code from line 45 ~ line 114 +# TODO(guomingz): Remove duplicated code from resize_short, crop_image, process_image, _reader_creator def resize_short(img, target_size): percent = float(target_size) / min(img.size[0], img.size[1]) resized_width = int(round(img.size[0] * percent)) @@ -123,16 +120,37 @@ class TestCalibrationForResnet50(unittest.TestCase): self.cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' + self.int8_download) - data_url = 'http://paddle-inference-dist.cdn.bcebos.com/int8/calibration_test_data.tar.gz' - data_md5 = '1b6c1c434172cca1bf9ba1e4d7a3157d' - self.data_cache_folder = self.download_data(data_url, data_md5, "data") + data_urls = [] + data_md5s = [] + self.data_cache_folder = '' + if os.environ.get('DATASET') == 
'full': + data_urls.append( + 'https://paddle-inference-dist.bj.bcebos.com/int8/ILSVRC2012_img_val.tar.gz.partaa' + ) + data_md5s.append('60f6525b0e1d127f345641d75d41f0a8') + data_urls.append( + 'https://paddle-inference-dist.bj.bcebos.com/int8/ILSVRC2012_img_val.tar.gz.partab' + ) + data_md5s.append('1e9f15f64e015e58d6f9ec3210ed18b5') + self.data_cache_folder = self.download_data(data_urls, data_md5s, + "full_data", False) + else: + data_urls.append( + 'http://paddle-inference-dist.cdn.bcebos.com/int8/calibration_test_data.tar.gz' + ) + data_md5s.append('1b6c1c434172cca1bf9ba1e4d7a3157d') + self.data_cache_folder = self.download_data(data_urls, data_md5s, + "small_data", False) # reader/decorator.py requires the relative path to the data folder cmd = 'rm -rf {0} && ln -s {1} {0}'.format("data", self.data_cache_folder) os.system(cmd) - self.iterations = 50 + self.batch_size = 1 + self.sample_iterations = 50 + self.infer_iterations = 50000 if os.environ.get( + 'DATASET') == 'full' else 50 def cache_unzipping(self, target_folder, zip_path): if not os.path.exists(target_folder): @@ -140,20 +158,44 @@ class TestCalibrationForResnet50(unittest.TestCase): zip_path) os.system(cmd) - def download_data(self, data_url, data_md5, folder_name): - download(data_url, self.int8_download, data_md5) + def download_data(self, data_urls, data_md5s, folder_name, is_model=True): data_cache_folder = os.path.join(self.cache_folder, folder_name) - file_name = data_url.split('/')[-1] - zip_path = os.path.join(self.cache_folder, file_name) + zip_path = '' + if os.environ.get('DATASET') == 'full': + file_names = [] + for i in range(0, len(data_urls)): + download(data_urls[i], self.int8_download, data_md5s[i]) + file_names.append(data_urls[i].split('/')[-1]) + + zip_path = os.path.join(self.cache_folder, + 'full_imagenet_val.tar.gz') + if not os.path.exists(zip_path): + cat_command = 'cat' + for file_name in file_names: + cat_command += ' ' + os.path.join(self.cache_folder, + file_name) + 
cat_command += ' > ' + zip_path + os.system(cat_command) + + if os.environ.get('DATASET') != 'full' or is_model: + download(data_urls[0], self.int8_download, data_md5s[0]) + file_name = data_urls[0].split('/')[-1] + zip_path = os.path.join(self.cache_folder, file_name) + + print('Data is downloaded at {0}').format(zip_path) self.cache_unzipping(data_cache_folder, zip_path) return data_cache_folder - def download_resnet50_model(self): + def download_model(self): # resnet50 fp32 data - data_url = 'http://paddle-inference-dist.cdn.bcebos.com/int8/resnet50_int8_model.tar.gz' - data_md5 = '4a5194524823d9b76da6e738e1367881' - self.model_cache_folder = self.download_data(data_url, data_md5, + data_urls = [ + 'http://paddle-inference-dist.cdn.bcebos.com/int8/resnet50_int8_model.tar.gz' + ] + data_md5s = ['4a5194524823d9b76da6e738e1367881'] + self.model_cache_folder = self.download_data(data_urls, data_md5s, "resnet50_fp32") + self.model = "ResNet-50" + self.algo = "direct" def run_program(self, model_path, generate_int8=False, algo='direct'): image_shape = [3, 224, 224] @@ -169,17 +211,17 @@ class TestCalibrationForResnet50(unittest.TestCase): t = fluid.transpiler.InferenceTranspiler() t.transpile(infer_program, fluid.CPUPlace()) - val_reader = paddle.batch(val(), batch_size=1) + val_reader = paddle.batch(val(), self.batch_size) + iterations = self.infer_iterations if generate_int8: int8_model = os.path.join(os.getcwd(), "calibration_out") + iterations = self.sample_iterations if os.path.exists(int8_model): os.system("rm -rf " + int8_model) os.system("mkdir " + int8_model) - print("Start calibration ...") - calibrator = int8_utility.Calibrator( program=infer_program, pretrained_model=model_path, @@ -191,6 +233,7 @@ class TestCalibrationForResnet50(unittest.TestCase): test_info = [] cnt = 0 + periods = [] for batch_id, data in enumerate(val_reader()): image = np.array( [x[0].reshape(image_shape) for x in data]).astype("float32") @@ -202,21 +245,28 @@ class 
TestCalibrationForResnet50(unittest.TestCase): if op.has_attr("use_mkldnn"): op._set_attr("use_mkldnn", True) + t1 = time.time() _, acc1, _ = exe.run( running_program, feed={feed_dict[0]: image, feed_dict[1]: label}, fetch_list=fetch_targets) + t2 = time.time() + period = t2 - t1 + periods.append(period) + if generate_int8: calibrator.sample_data() test_info.append(np.mean(acc1) * len(data)) cnt += len(data) - if batch_id != self.iterations - 1: - continue + if (batch_id + 1) % 100 == 0: + print("{0} images,".format(batch_id + 1)) + sys.stdout.flush() - break + if (batch_id + 1) == iterations: + break if generate_int8: calibrator.save_int8_model() @@ -225,32 +275,49 @@ class TestCalibrationForResnet50(unittest.TestCase): "Calibration is done and the corresponding files are generated at {}". format(os.path.abspath("calibration_out"))) else: - return np.sum(test_info) / cnt + throughput = cnt / np.sum(periods) + latency = np.average(periods) + acc1 = np.sum(test_info) / cnt + return (throughput, latency, acc1) def test_calibration(self): - self.download_resnet50_model() - fp32_acc1 = self.run_program(self.model_cache_folder + "/model") - self.run_program(self.model_cache_folder + "/model", True) - int8_acc1 = self.run_program("calibration_out") + self.download_model() + print("Start FP32 inference for {0} on {1} images ...").format( + self.model, self.infer_iterations) + (fp32_throughput, fp32_latency, + fp32_acc1) = self.run_program(self.model_cache_folder + "/model") + print("Start INT8 calibration for {0} on {1} images ...").format( + self.model, self.sample_iterations) + self.run_program( + self.model_cache_folder + "/model", True, algo=self.algo) + print("Start INT8 inference for {0} on {1} images ...").format( + self.model, self.infer_iterations) + (int8_throughput, int8_latency, + int8_acc1) = self.run_program("calibration_out") delta_value = np.abs(fp32_acc1 - int8_acc1) self.assertLess(delta_value, 0.01) + print( + "FP32 {0}: batch_size {1}, throughput {2} 
images/second, latency {3} second, accuracy {4}". + format(self.model, self.batch_size, fp32_throughput, fp32_latency, + fp32_acc1)) + print( + "INT8 {0}: batch_size {1}, throughput {2} images/second, latency {3} second, accuracy {4}". + format(self.model, self.batch_size, int8_throughput, int8_latency, + int8_acc1)) + sys.stdout.flush() class TestCalibrationForMobilenetv1(TestCalibrationForResnet50): - def download_mobilenetv1_model(self): + def download_model(self): # mobilenetv1 fp32 data - data_url = 'http://paddle-inference-dist.cdn.bcebos.com/int8/mobilenetv1_int8_model.tar.gz' - data_md5 = '13892b0716d26443a8cdea15b3c6438b' - self.model_cache_folder = self.download_data(data_url, data_md5, + data_urls = [ + 'http://paddle-inference-dist.cdn.bcebos.com/int8/mobilenetv1_int8_model.tar.gz' + ] + data_md5s = ['13892b0716d26443a8cdea15b3c6438b'] + self.model_cache_folder = self.download_data(data_urls, data_md5s, "mobilenetv1_fp32") - - def test_calibration(self): - self.download_mobilenetv1_model() - fp32_acc1 = self.run_program(self.model_cache_folder + "/model") - self.run_program(self.model_cache_folder + "/model", True, algo='KL') - int8_acc1 = self.run_program("calibration_out") - delta_value = np.abs(fp32_acc1 - int8_acc1) - self.assertLess(delta_value, 0.01) + self.model = "MobileNet-V1" + self.algo = "KL" if __name__ == '__main__': diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 2bdae60db347b3d42fded138a20a505486e48dbc..c0b0ad8a202b82183de9ec1edd43cb10db10fb5c 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -445,11 +445,16 @@ class Variable(object): @property def _stop_gradient(self): - return self._ivar.stop_gradient + if _in_imperative_mode(): + return self._ivar.stop_gradient + else: + return self.stop_gradient @_stop_gradient.setter def _stop_gradient(self, s): - self._ivar.stop_gradient = s + if _in_imperative_mode(): + self._ivar.stop_gradient = s + self.stop_gradient = 
s @property def persistable(self): @@ -1310,6 +1315,9 @@ class Block(object): outputs=kwargs.get("outputs", None), attrs=kwargs.get("attrs", None)) self.ops.append(op) + + # TODO(minqiyang): add stop_gradient support in static mode too. + # currently, we only support stop_gradient in imperative mode. self._trace_op(op, kwargs.get("stop_gradient", False)) return op @@ -1717,6 +1725,19 @@ class Program(object): self._trainers_endpoints = [] # the distributed lookup table names self._distributed_lookup_table = None + # @deprecated(the python memory optimize transpiler is deprecated) + # whether the program is optimized by memory_optimize_transpiler + self.__is_mem_optimized = False + + @property + def _is_mem_optimized(self): + # if the program is optimized, operator input/outputs + # maybe same, which conflict with save_inference_model. + return self.__is_mem_optimized + + @_is_mem_optimized.setter + def _is_mem_optimized(self, target): + self.__is_mem_optimized = target @property def op_role(self): @@ -1736,7 +1757,7 @@ class Program(object): return self._current_role @op_role.setter - def set_op_role(self, role): + def op_role(self, role): self._current_role = role @property diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py index f457f56203eb2c1da62f4d8ad8915c322c822e0a..71ff95bdea36967c1fa6b5c94cc7ca305e7a544a 100644 --- a/python/paddle/fluid/imperative/layers.py +++ b/python/paddle/fluid/imperative/layers.py @@ -15,6 +15,7 @@ import contextlib import sys import numpy as np +import collections from paddle.fluid import core from paddle.fluid import framework @@ -31,7 +32,23 @@ class Layer(core.Layer): self._dtype = dtype def parameters(self): - return [] + params = [] + for key in self.__dict__.keys(): + value = self.__dict__[key] + if isinstance(value, framework.Parameter): + params.append(value) + elif isinstance(value, core.Layer): + params.extend(value.parameters()) + elif isinstance(value, collections.Container): 
+ if len(value) == 0: + continue + if isinstance(value[0], framework.Parameter): + params.extend(value) + elif isinstance(value[0], core.Layer): + for v in value: + params.extend(v.parameters()) + + return params def clear_gradients(self): for p in self.parameters(): diff --git a/python/paddle/fluid/imperative/nn.py b/python/paddle/fluid/imperative/nn.py index 140c0ff037d453641cc119301269121025e17cbd..6c5961cc63d1c140e0a6f33aac054acdbbe8e8e0 100644 --- a/python/paddle/fluid/imperative/nn.py +++ b/python/paddle/fluid/imperative/nn.py @@ -22,13 +22,7 @@ from . import layers from ..framework import Variable, OpProtoHolder from ..param_attr import ParamAttr from ..initializer import Normal, Constant - -__all__ = [ - 'Conv2D', - 'Pool2D', - 'FC', - 'BatchNorm', -] +__all__ = ['Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding'] class Conv2D(layers.Layer): @@ -332,21 +326,16 @@ class BatchNorm(layers.Layer): shape=param_shape, dtype=self._dtype, default_initializer=Constant(1.0)) - - # TODO(minqiyang): change stop_gradient sign to trainable to align with static graph - # # setting stop_gradient=True to reduce computation - # if use_global_stats and self._helper.param_attr.learning_rate == 0.: - # self._scale.stop_gradient = True + if use_global_stats and self._helper.param_attr.learning_rate == 0.: + self._scale._stop_gradient = True self._bias = self._helper.create_parameter( attr=self._helper.bias_attr, shape=param_shape, dtype=self._dtype, is_bias=True) - # TODO(minqiyang): change stop_gradient sign to trainable to align with static graph - # # setting stop_gradient=True to reduce computation - # if use_global_stats and self._helper.bias_attr.learning_rate == 0.: - # self._bias.stop_gradient = True + if use_global_stats and self._helper.bias_attr.learning_rate == 0.: + self._bias._stop_gradient = True self._mean = self._helper.create_parameter( attr=ParamAttr( @@ -356,7 +345,7 @@ class BatchNorm(layers.Layer): do_model_average=do_model_average_for_mean_and_var), 
shape=param_shape, dtype=self._dtype) - self._mean.stop_gradient = True + self._mean._stop_gradient = True self._variance = self._helper.create_parameter( attr=ParamAttr( @@ -366,7 +355,7 @@ class BatchNorm(layers.Layer): do_model_average=do_model_average_for_mean_and_var), shape=param_shape, dtype=self._dtype) - self._variance.stop_gradient = True + self._variance._stop_gradient = True self._in_place = in_place self._momentum = momentum @@ -419,3 +408,91 @@ class BatchNorm(layers.Layer): # Currently, we don't support inplace in imperative mode return self._helper.append_activation(batch_norm_out) + + +class Embedding(layers.Layer): + """ + **Embedding Layer** + + This layer is used to lookup embeddings of IDs, provided by :attr:`input`, in + a lookup table. The result of this lookup is the embedding of each ID in the + :attr:`input`. + + All the input variables are passed in as local variables to the LayerHelper + constructor. + + Args: + size(tuple|list): The shape of the look up table parameter. It should + have two elements which indicate the size of the dictionary of + embeddings and the size of each embedding vector respectively. + is_sparse(bool): The flag indicating whether to use sparse update. + is_distributed(bool): Whether to run lookup table from remote parameter server. + padding_idx(int|long|None): If :attr:`None`, it makes no effect to lookup. + Otherwise the given :attr:`padding_idx` indicates padding the output + with zeros whenever lookup encounters it in :attr:`input`. If + :math:`padding_idx < 0`, the :attr:`padding_idx` to use in lookup is + :math:`size[0] + dim`. + param_attr(ParamAttr): Parameters for this layer + dtype(np.dtype|core.VarDesc.VarType|str): The type of data : float32, float_16, int etc + + Returns: + Variable: The tensor variable storing the embeddings of the \ + supplied inputs. + + Examples: + .. 
code-block:: python + + dict_size = len(dataset.ids) + input = fluid.layers.data(name='ids', shape=[32, 32], dtype='float32') + embedding = fluid.imperative.Embedding(size=[dict_size, 16]) + fc = embedding(input) + """ + + def __init__(self, + size, + is_sparse=False, + is_distributed=False, + padding_idx=None, + param_attr=None, + dtype='float32'): + + super(Embedding, self).__init__() + self._size = size + self._is_sparse = is_sparse + self._is_distributed = is_distributed + + self._padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else ( + size[0] + padding_idx) + + self._param_attr = param_attr + self._dtype = dtype + self._remote_prefetch = self._is_sparse and (not self._is_distributed) + if self._remote_prefetch: + assert self._is_sparse is True and self._is_distributed is False + + from ..layer_helper import LayerHelper + self._helper = LayerHelper('embedding', param_attr=param_attr) + self._w = self._helper.create_parameter( + attr=self._param_attr, + shape=self._size, + dtype=self._dtype, + is_bias=False) + + def parameters(self): + return [self._w] + + def forward(self, input): + out = self._helper.create_variable_for_type_inference(self._dtype) + self._helper.append_op( + type='lookup_table', + inputs={'Ids': input, + 'W': self._w}, + outputs={'Out': out}, + attrs={ + 'is_sparse': self._is_sparse, + 'is_distributed': self._is_distributed, + 'remote_prefetch': self._remote_prefetch, + 'padding_idx': self._padding_idx + }) + + return out diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 4f434328e47df4363b304ff55f587018d3157c5e..5be21ff7f7270f6ce950c069f61418c922bcedc5 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -366,17 +366,40 @@ class TruncatedNormalInitializer(Initializer): # Initialization Ops should be prepended and not appended if self._seed == 0: self._seed = block.program.random_seed + + # to be compatible of fp16 initalizers + if 
var.dtype == VarDesc.VarType.FP16: + out_dtype = VarDesc.VarType.FP32 + out_var = block.create_var( + name=unique_name.generate(".".join( + ['truncated_gaussian_random', 'tmp'])), + shape=var.shape, + dtype=out_dtype, + type=VarDesc.VarType.LOD_TENSOR, + persistable=False) + else: + out_dtype = var.dtype + out_var = var + op = block._prepend_op( type="truncated_gaussian_random", - outputs={"Out": var}, + outputs={"Out": out_var}, attrs={ "shape": var.shape, - "dtype": int(var.dtype), + "dtype": out_dtype, "mean": self._mean, "std": self._std_dev, "seed": self._seed }, stop_gradient=True) + + if var.dtype == VarDesc.VarType.FP16: + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, + "out_dtype": var.dtype}) var.op = op return op diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 6b1d4cc34f3cd40c878740f28618f26d5e89a6bd..a2abbf36c0267d85c9c97af00c9faabf1187822c 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -16,14 +16,16 @@ from __future__ import print_function import os import errno +import warnings import time import shutil import six from functools import reduce +from paddle.fluid import layers from paddle.fluid.executor import Executor from paddle.fluid.evaluator import Evaluator -from paddle.fluid.framework import Program, Parameter, default_main_program, default_startup_program, Variable +from paddle.fluid.framework import Program, Parameter, default_main_program, default_startup_program, Variable, program_guard from . import core __all__ = [ @@ -930,6 +932,24 @@ def save_inference_model(dirname, if main_program is None: main_program = default_main_program() + if main_program._is_mem_optimized: + warnings.warn( + "save_inference_model must put before you call memory_optimize. 
\ + the memory_optimize will modify the original program, \ + is not suitable for saving inference model \ + we save the original program as inference model.", + RuntimeWarning) + + # fix the bug that the activation op's output as target will be pruned. + # will affect the inference performance. + # TODO(Superjomn) add an IR pass to remove 1-scale op. + with program_guard(main_program): + uniq_target_vars = [] + for var in target_vars: + if isinstance(var, Variable): + var1 = layers.scale(var, 1.) + uniq_target_vars.append(var1) + target_vars = uniq_target_vars # when a pserver and a trainer running on the same machine, mkdir may conflict try: diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index 972c51938f2b2282f8de4b090f9af3bc66f89155..a172141b3a0455769dc1ce74d098be057324e047 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -300,6 +300,17 @@ class LayerHelper(object): attr.name = unique_name.generate(".".join([self.name, suffix])) if default_initializer is None and attr.initializer is None: + if isinstance(dtype, core.VarDesc.VarType): + if dtype != core.VarDesc.VarType.FP32 and \ + dtype != core.VarDesc.VarType.FP64: + raise TypeError( + "Can not create parameter with default initializer when dtype is not float type. Set default_initializer to fit the parameter dtype!" + ) + else: + if not (dtype.startswith("float") or dtype == "double"): + raise TypeError( + "Can not create parameter with default initializer when dtype is not float type. Set default_initializer to fit the parameter dtype!" 
+ ) if is_bias: attr._set_default_bias_initializer() else: diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index cddc302d52e0a5aea802fd7e1464f1e220c8f769..3b43ae0b9cb63a9f4708a680cb1021d74c197550 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -49,6 +49,8 @@ __all__ = [ 'box_coder', 'polygon_box_transform', 'yolov3_loss', + 'box_clip', + 'multiclass_nms', ] @@ -262,8 +264,10 @@ def detection_output(loc, number is N + 1, N is the batch size. The i-th image has `LoD[i + 1] - LoD[i]` detected results, if it is 0, the i-th image has no detected results. If all images have not detected results, - all the elements in LoD are 0, and output tensor only contains one + LoD will be set to {1}, and output tensor only contains one value, which is -1. + (After version 1.3, when no boxes detected, the lod is changed + from {0} to {1}.) Examples: .. code-block:: python @@ -343,19 +347,107 @@ def box_coder(prior_box, target_box, code_type="encode_center_size", box_normalized=True, - name=None): + name=None, + axis=0): """ - ${comment} + **Box Coder Layer** + + Encode/Decode the target bounding box with the priorbox information. + + The Encoding schema described below: + + .. math:: + + ox = (tx - px) / pw / pxv + + oy = (ty - py) / ph / pyv + + ow = \log(\abs(tw / pw)) / pwv + + oh = \log(\abs(th / ph)) / phv + + The Decoding schema described below: + + .. math:: + + ox = (pw * pxv * tx * + px) - tw / 2 + + oy = (ph * pyv * ty * + py) - th / 2 + + ow = \exp(pwv * tw) * pw + tw / 2 + + oh = \exp(phv * th) * ph + th / 2 + + where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, + width and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote + the priorbox's (anchor) center coordinates, width and height. `pxv`, + `pyv`, `pwv`, `phv` denote the variance of the priorbox and `ox`, `oy`, + `ow`, `oh` denote the encoded/decoded coordinates, width and height. 
+ + During Box Decoding, two modes for broadcast are supported. Say target + box has shape [N, M, 4], and the shape of prior box can be [N, 4] or + [M, 4]. Then prior box will broadcast to target box along the + assigned axis. Args: - prior_box(${prior_box_type}): ${prior_box_comment} - prior_box_var(${prior_box_var_type}): ${prior_box_var_comment} - target_box(${target_box_type}): ${target_box_comment} - code_type(${code_type_type}): ${code_type_comment} - box_normalized(${box_normalized_type}): ${box_normalized_comment} + prior_box(Variable): Box list prior_box is a 2-D Tensor with shape + [M, 4] holds M boxes, each box is represented as + [xmin, ymin, xmax, ymax], [xmin, ymin] is the + left top coordinate of the anchor box, if the + input is image feature map, they are close to + the origin of the coordinate system. [xmax, ymax] + is the right bottom coordinate of the anchor box. + prior_box_var(Variable|list|None): prior_box_var supports two types + of input. One is variable with shape [M, 4] + holds M group. The other one is list consist of + 4 elements shared by all boxes. + target_box(Variable): This input can be a 2-D LoDTensor with shape + [N, 4] when code_type is 'encode_center_size'. + This input also can be a 3-D Tensor with shape + [N, M, 4] when code_type is 'decode_center_size'. + Each box is represented as + [xmin, ymin, xmax, ymax]. This tensor can + contain LoD information to represent a batch + of inputs. + code_type(string): The code type used with the target box. It can be + encode_center_size or decode_center_size + box_normalized(int): Whether treat the priorbox as a noramlized box. + Set true by default. + name(string): The name of box coder. + axis(int): Which axis in PriorBox to broadcast for box decode, + for example, if axis is 0 and TargetBox has shape + [N, M, 4] and PriorBox has shape [M, 4], then PriorBox + will broadcast to [N, M, 4] for decoding. It is only valid + when code type is decode_center_size. Set 0 by default. 
Returns: - output_box(${output_box_type}): ${output_box_comment} + output_box(Variable): When code_type is 'encode_center_size', the + output tensor of box_coder_op with shape + [N, M, 4] representing the result of N target + boxes encoded with M Prior boxes and variances. + When code_type is 'decode_center_size', + N represents the batch size and M represents + the number of deocded boxes. + + Examples: + + .. code-block:: python + + prior_box = fluid.layers.data(name='prior_box', + shape=[512, 4], + dtype='float32', + append_batch_size=False) + target_box = fluid.layers.data(name='target_box', + shape=[512,81,4], + dtype='float32', + append_batch_size=False) + output = fluid.layers.box_coder(prior_box=prior_box, + prior_box_var=[0.1,0.1,0.2,0.2], + target_box=target_box, + code_type="decode_center_size", + box_normalized=False, + axis=1) + """ helper = LayerHelper("box_coder", **locals()) @@ -366,15 +458,22 @@ def box_coder(prior_box, output_box = helper.create_variable( name=name, dtype=prior_box.dtype, persistable=False) + inputs = {"PriorBox": prior_box, "TargetBox": target_box} + attrs = { + "code_type": code_type, + "box_normalized": box_normalized, + "axis": axis + } + if isinstance(prior_box_var, Variable): + inputs['PriorBoxVar'] = prior_box_var + elif isinstance(prior_box_var, list): + attrs['variance'] = prior_box_var + else: + raise TypeError("Input variance of box_coder must be Variable or lisz") helper.append_op( type="box_coder", - inputs={ - "PriorBox": prior_box, - "PriorBoxVar": prior_box_var, - "TargetBox": target_box - }, - attrs={"code_type": code_type, - "box_normalized": box_normalized}, + inputs=inputs, + attrs=attrs, outputs={"OutputBox": output_box}) return output_box @@ -410,13 +509,10 @@ def yolov3_loss(x, gtbox, gtlabel, anchors, + anchor_mask, class_num, ignore_thresh, - loss_weight_xy=None, - loss_weight_wh=None, - loss_weight_conf_target=None, - loss_weight_conf_notarget=None, - loss_weight_class=None, + downsample_ratio, 
name=None): """ ${comment} @@ -428,16 +524,13 @@ def yolov3_loss(x, and x, y, w, h should be relative value of input image. N is the batch number and B is the max box number in an image. - gtlabel (Variable): class id of ground truth boxes, shoud be ins shape + gtlabel (Variable): class id of ground truth boxes, shoud be in shape of [N, B]. anchors (list|tuple): ${anchors_comment} + anchor_mask (list|tuple): ${anchor_mask_comment} class_num (int): ${class_num_comment} ignore_thresh (float): ${ignore_thresh_comment} - loss_weight_xy (float|None): ${loss_weight_xy_comment} - loss_weight_wh (float|None): ${loss_weight_wh_comment} - loss_weight_conf_target (float|None): ${loss_weight_conf_target_comment} - loss_weight_conf_notarget (float|None): ${loss_weight_conf_notarget_comment} - loss_weight_class (float|None): ${loss_weight_class_comment} + downsample_ratio (int): ${downsample_ratio_comment} name (string): the name of yolov3 loss Returns: @@ -457,9 +550,10 @@ def yolov3_loss(x, x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32') gtbox = fluid.layers.data(name='gtbox', shape=[6, 5], dtype='float32') gtlabel = fluid.layers.data(name='gtlabel', shape=[6, 1], dtype='int32') - anchors = [10, 13, 16, 30, 33, 23] - loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, class_num=80 - anchors=anchors, ignore_thresh=0.5) + anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326] + anchors = [0, 1, 2] + loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, class_num=80, anchors=anchors, + ignore_thresh=0.5, downsample_ratio=32) """ helper = LayerHelper('yolov3_loss', **locals()) @@ -471,6 +565,8 @@ def yolov3_loss(x, raise TypeError("Input gtlabel of yolov3_loss must be Variable") if not isinstance(anchors, list) and not isinstance(anchors, tuple): raise TypeError("Attr anchors of yolov3_loss must be list or tuple") + if not isinstance(anchor_mask, list) and not isinstance(anchor_mask, tuple): + raise TypeError("Attr anchor_mask of 
yolov3_loss must be list or tuple") if not isinstance(class_num, int): raise TypeError("Attr class_num of yolov3_loss must be an integer") if not isinstance(ignore_thresh, float): @@ -483,31 +579,29 @@ def yolov3_loss(x, loss = helper.create_variable( name=name, dtype=x.dtype, persistable=False) + objectness_mask = helper.create_variable_for_type_inference(dtype='int32') + gt_match_mask = helper.create_variable_for_type_inference(dtype='int32') + attrs = { "anchors": anchors, + "anchor_mask": anchor_mask, "class_num": class_num, "ignore_thresh": ignore_thresh, + "downsample_ratio": downsample_ratio, } - if loss_weight_xy is not None and isinstance(loss_weight_xy, float): - self.attrs['loss_weight_xy'] = loss_weight_xy - if loss_weight_wh is not None and isinstance(loss_weight_wh, float): - self.attrs['loss_weight_wh'] = loss_weight_wh - if loss_weight_conf_target is not None and isinstance( - loss_weight_conf_target, float): - self.attrs['loss_weight_conf_target'] = loss_weight_conf_target - if loss_weight_conf_notarget is not None and isinstance( - loss_weight_conf_notarget, float): - self.attrs['loss_weight_conf_notarget'] = loss_weight_conf_notarget - if loss_weight_class is not None and isinstance(loss_weight_class, float): - self.attrs['loss_weight_class'] = loss_weight_class - helper.append_op( type='yolov3_loss', - inputs={"X": x, - "GTBox": gtbox, - "GTLabel": gtlabel}, - outputs={'Loss': loss}, + inputs={ + "X": x, + "GTBox": gtbox, + "GTLabel": gtlabel, + }, + outputs={ + 'Loss': loss, + 'ObjectnessMask': objectness_mask, + 'GTMatchMask': gt_match_mask + }, attrs=attrs) return loss @@ -1960,3 +2054,169 @@ def generate_proposals(scores, rpn_roi_probs.stop_gradient = True return rpn_rois, rpn_roi_probs + + +def box_clip(input, im_info, name=None): + """ + Clip the box into the size given by im_info + For each input box, The formula is given as follows: + + .. 
code-block:: text + + xmin = max(min(xmin, im_w - 1), 0) + ymin = max(min(ymin, im_h - 1), 0) + xmax = max(min(xmax, im_w - 1), 0) + ymax = max(min(ymax, im_h - 1), 0) + + where im_w and im_h are computed from im_info: + + .. code-block:: text + + im_h = round(height / scale) + im_w = round(weight / scale) + + Args: + input(variable): The input box, the last dimension is 4. + im_info(variable): The information of image with shape [N, 3] with + layout (height, width, scale). height and width + is the input size and scale is the ratio of input + size and original size. + name (str): The name of this layer. It is optional. + + Returns: + Variable: The cliped tensor variable. + + Examples: + .. code-block:: python + + boxes = fluid.layers.data( + name='data', shape=[8, 4], dtype='float32', lod_level=1) + im_info = fluid.layers.data(name='im_info', shape=[3]) + out = fluid.layers.box_clip( + input=boxes, im_info=im_info, inplace=True) + """ + + helper = LayerHelper("box_clip", **locals()) + output = helper.create_variable_for_type_inference(dtype=input.dtype) + inputs = {"Input": input, "ImInfo": im_info} + helper.append_op(type="box_clip", inputs=inputs, outputs={"Output": output}) + + return output + + +def multiclass_nms(bboxes, + scores, + score_threshold, + nms_top_k, + keep_top_k, + nms_threshold=0.3, + normalized=True, + nms_eta=1., + background_label=0, + name=None): + """ + **Multiclass NMS** + + This operator is to do multi-class non maximum suppression (NMS) on + boxes and scores. + + In the NMS step, this operator greedily selects a subset of detection bounding + boxes that have high scores larger than score_threshold, if providing this + threshold, then selects the largest nms_top_k confidences scores if nms_top_k + is larger than -1. Then this operator pruns away boxes that have high IOU + (intersection over union) overlap with already selected boxes by adaptive + threshold NMS based on parameters of nms_threshold and nms_eta. 
+ + Aftern NMS step, at most keep_top_k number of total bboxes are to be kept + per image if keep_top_k is larger than -1. + + Args: + bboxes (Variable): Two types of bboxes are supported: + 1. (Tensor) A 3-D Tensor with shape + [N, M, 4 or 8 16 24 32] represents the + predicted locations of M bounding bboxes, + N is the batch size. Each bounding box has four + coordinate values and the layout is + [xmin, ymin, xmax, ymax], when box size equals to 4. + 2. (LoDTensor) A 3-D Tensor with shape [M, C, 4] + M is the number of bounding boxes, C is the + class number + scores (Variable): Two types of scores are supported: + 1. (Tensor) A 3-D Tensor with shape [N, C, M] + represents the predicted confidence predictions. + N is the batch size, C is the class number, M is + number of bounding boxes. For each category there + are total M scores which corresponding M bounding + boxes. Please note, M is equal to the 2nd dimension + of BBoxes. + 2. (LoDTensor) A 2-D LoDTensor with shape [M, C]. + M is the number of bbox, C is the class number. + In this case, input BBoxes should be the second + case with shape [M, C, 4]. + background_label (int): The index of background label, the background + label will be ignored. If set to -1, then all + categories will be considered. Default: 0 + score_threshold (float): Threshold to filter out bounding boxes with + low confidence score. If not provided, + consider all boxes. + nms_top_k (int): Maximum number of detections to be kept according to + the confidences aftern the filtering detections based + on score_threshold. + nms_threshold (float): The threshold to be used in NMS. Default: 0.3 + nms_eta (float): The threshold to be used in NMS. Default: 1.0 + keep_top_k (int): Number of total bboxes to be kept per image after NMS + step. -1 means keeping all bboxes after NMS step. + normalized (bool): Whether detections are normalized. Default: True + name(str): Name of the multiclass nms op. Default: None. 
+ + Returns: + Out: A 2-D LoDTensor with shape [No, 6] represents the detections. + Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax] + or A 2-D LoDTensor with shape [No, 10] represents the detections. + Each row has 10 values: + [label, confidence, x1, y1, x2, y2, x3, y3, x4, y4]. No is the + total number of detections. If there is no detected boxes for all + images, lod will be set to {1} and Out only contains one value + which is -1. + (After version 1.3, when no boxes detected, the lod is changed + from {0} to {1}) + + + Examples: + .. code-block:: python + + + boxes = fluid.layers.data(name='bboxes', shape=[81, 4], + dtype='float32', lod_level=1) + scores = fluid.layers.data(name='scores', shape=[81], + dtype='float32', lod_level=1) + out = fluid.layers.multiclass_nms(bboxes=boxes, + scores=scores, + background_label=0, + score_threshold=0.5, + nms_top_k=400, + nms_threshold=0.3, + keep_top_k=200, + normalized=False) + """ + helper = LayerHelper('multiclass_nms', **locals()) + + output = helper.create_variable_for_type_inference(dtype=bboxes.dtype) + helper.append_op( + type="multiclass_nms", + inputs={'BBoxes': bboxes, + 'Scores': scores}, + attrs={ + 'background_label': background_label, + 'score_threshold': score_threshold, + 'nms_top_k': nms_top_k, + 'nms_threshold': nms_threshold, + 'nms_eta': nms_eta, + 'keep_top_k': keep_top_k, + 'nms_eta': nms_eta, + 'normalized': normalized + }, + outputs={'Out': output}) + output.stop_gradient = True + + return output diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index beb5e31211c5f9aa6bddfcb1da7e63d6480e99e1..0e4b5aadc0b0d7e87ea1cfb8e18339fe211e1eef 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -932,7 +932,7 @@ def dynamic_gru(input, create ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is initialized with Xavier. Default: None. 
bias_attr (ParamAttr|bool|None): The parameter attribute for the bias - of GRU. Note that the bias with :math:`(1 \\times 3D)` concatenates + of GRU.Note that the bias with :math:`(1 \\times 3D)` concatenates the bias in the update gate, reset gate and candidate calculations. If it is set to False, no bias will be applied to the update gate, reset gate and candidate calculations. If it is set to None or one @@ -1073,7 +1073,7 @@ def gru_unit(input, create ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is initialized with Xavier. Default: None. bias_attr (ParamAttr|bool|None): The parameter attribute for the bias - of GRU. Note that the bias with :math:`(1 \\times 3D)` concatenates + of GRU.Note that the bias with :math:`(1 \\times 3D)` concatenates the bias in the update gate, reset gate and candidate calculations. If it is set to False, no bias will be applied to the update gate, reset gate and candidate calculations. If it is set to None or one @@ -3877,7 +3877,8 @@ def beam_search(pre_ids, end_id, level=0, is_accumulated=True, - name=None): + name=None, + return_parent_idx=False): """ Beam search is a classical algorithm for selecting candidate words in a machine translation task. @@ -3933,10 +3934,16 @@ def beam_search(pre_ids, accumulated scores. name(str|None): A name for this layer(optional). If set None, the layer will be named automatically. + return_parent_idx(bool): Whether to return an extra Tensor variable + preserving the selected_ids' parent indice in pre_ids + in output, which can be used to gather cell states at + the next time step. Returns: - Variable: The LodTensor pair containing the selected ids and the \ - corresponding scores. + Variable: The LodTensor tuple containing the selected ids and the \ + corresponding scores. If :attr:`return_parent_idx` is :attr:`True`, \ + an extra Tensor variable preserving the selected_ids' parent indice \ + is included. Examples: .. 
code-block:: python @@ -3969,6 +3976,11 @@ def beam_search(pre_ids, selected_scores = helper.create_variable_for_type_inference( dtype=score_type) selected_ids = helper.create_variable_for_type_inference(dtype=id_type) + # parent_idx is a tensor used to gather cell states at the next time + # step. Though lod in selected_ids can also be used to gather by + # sequence_expand, it is not efficient. + # gather_op's index input only supports int32 dtype currently + parent_idx = helper.create_variable_for_type_inference(dtype="int32") helper.append_op( type='beam_search', @@ -3976,6 +3988,7 @@ def beam_search(pre_ids, outputs={ 'selected_ids': selected_ids, 'selected_scores': selected_scores, + 'parent_idx': parent_idx }, attrs={ # TODO(ChunweiYan) to assure other value support @@ -3984,8 +3997,10 @@ def beam_search(pre_ids, 'end_id': end_id, 'is_accumulated': is_accumulated, }) - - return selected_ids, selected_scores + if return_parent_idx: + return selected_ids, selected_scores, parent_idx + else: + return selected_ids, selected_scores def beam_search_decode(ids, scores, beam_size, end_id, name=None): @@ -5403,7 +5418,7 @@ def transpose(x, perm, name=None): Examples: .. code-block:: python - # use append_batch_size=False to avoid prepending extra + # use append_batch_size=False to avoid prepending extra # batch size in shape x = fluid.layers.data(name='x', shape=[5, 10, 15], dtype='float32', append_batch_size=False) @@ -5920,7 +5935,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): than :attr:`shape`. act (str): The non-linear activation to be applied to the reshaped tensor variable. - inplace(bool): Must use :attr:`False` if :attr:`x` is used in multiple + inplace(bool): Must use :attr:`False` if :attr:`x` is used in multiple operators. 
If this flag is set :attr:`True`, reuse input :attr:`x` to reshape, which will change the shape of tensor variable :attr:`x` and might cause errors when @@ -6581,7 +6596,9 @@ def image_resize(input, scale=None, name=None, resample='BILINEAR', - actual_shape=None): + actual_shape=None, + align_corners=True, + align_mode=1): """ **Resize a Batch of Images** @@ -6594,6 +6611,80 @@ def image_resize(input, 'NEAREST' : Nearest neighbor interpolation + Nearest neighbor interpolation is to perform nearest neighbor interpolation + in both the 3rd dimension (in height direction) and the 4th dimension (in width + direction) on input tensor. + + Bilinear interpolation is an extension of linear interpolation for + interpolating functions of two variables (e.g. H-direction and + W-direction in this op) on a rectilinear 2D grid. The key idea is + to perform linear interpolation first in one direction, and then + again in the other direction. + + Align_corners and align_mode are optional parameters; the calculation method + of interpolation can be selected by them.
+ + Example: + + For scale: + + if align_corners = True && out_size > 1 : + + scale_factor = (in_size-1.0)/(out_size-1.0) + + else: + + scale_factor = float(in_size/out_size) + + + Nearest neighbor interpolation: + + if: + align_corners = False + + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + + H_out = \\left \\lfloor {H_{in} * scale_{factor}} \\right \\rfloor + W_out = \\left \\lfloor {W_{in} * scale_{factor}} \\right \\rfloor + + else: + align_corners = True + + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + + H_out = round(H_{in} * scale_{factor}) + W_out = round(W_{in} * scale_{factor}) + + Bilinear interpolation: + + if: + align_corners = False , align_mode = 0 + + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + + H_out = (H_{in}+0.5) * scale_{factor} - 0.5 + W_out = (W_{in}+0.5) * scale_{factor} - 0.5 + + + else: + + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + + H_out = H_{in} * scale_{factor} + W_out = W_{in} * scale_{factor} + + For details of nearest neighbor interpolation, please refer to Wikipedia: + https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation. + + For details of bilinear interpolation, please refer to Wikipedia: + https://en.wikipedia.org/wiki/Bilinear_interpolation. + + + Args: input (Variable): The input tensor of image resize layer, This is a 4-D tensor of the shape @@ -6623,6 +6714,13 @@ def image_resize(input, set, otherwise errors would be occured in graph constructing stage. Default: None + align_corners(bool) : An optional bool, If True, the centers of the 4 corner pixels of the + input and output tensors are aligned, preserving the values at the + corner pixels. + Default: True + align_mode(int) : An optional parameter for bilinear interpolation. Can be \'0\' + for src_idx = scale*(dst_index+0.5)-0.5 , can be \'1\' for + src_idx = scale*dst_index . Returns: Variable: The output is a 4-D tensor of the shape @@ -6635,6 +6733,8 @@ def image_resize(input, or 'NEAREST' currently.
ValueError: One of out_shape and scale must not be None. ValueError: out_shape length should be 2. + TypeError: align_corners should be a bool value + ValueError: align_mode can only be '0' or '1' Examples: .. code-block:: python @@ -6650,6 +6750,12 @@ def image_resize(input, "The 'resample' of image_resize can only be 'BILINEAR' or 'NEAREST' currently." ) resample_type = resample_methods[resample] + + if not isinstance(align_corners, bool): + raise TypeError("Attr align_corners should be a bool value") + if align_mode != 0 and align_mode != 1: + raise ValueError("align_mode can only be 0 or 1") + if out_shape is None and scale is None: raise ValueError("One of out_shape and scale must not be None.") helper = LayerHelper('{}_interp'.format(resample_type), **locals()) @@ -6689,9 +6795,13 @@ def image_resize(input, type='{}_interp'.format(resample_type), inputs=inputs, outputs={"Out": out}, - attrs={"out_h": out_h, - "out_w": out_w, - "interp_method": resample_type}) + attrs={ + "out_h": out_h, + "out_w": out_w, + "interp_method": resample_type, + "align_corners": align_corners, + "align_mode": align_mode + }) return out @@ -6700,7 +6810,9 @@ def resize_bilinear(input, out_shape=None, scale=None, name=None, - actual_shape=None): + actual_shape=None, + align_corners=True, + align_mode=1): """ Resize input by performing bilinear interpolation based on given output shape which specified by actual_shape, out_shape and scale @@ -6715,6 +6827,47 @@ def resize_bilinear(input, For details of bilinear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Bilinear_interpolation + Align_corners and align_mode are optional parameters; the calculation + method of interpolation can be selected by them. + + + Align_corners and align_mode are optional parameters; the calculation method + of interpolation can be selected by them.
+ + Example: + + For scale: + + if align_corners = True && out_size > 1 : + + scale_factor = (in_size-1.0)/(out_size-1.0) + + else: + + scale_factor = float(in_size/out_size) + + Bilinear interpolation: + + if: + align_corners = False , align_mode = 0 + + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + + H_out = (H_{in}+0.5) * scale_{factor} - 0.5 + W_out = (W_{in}+0.5) * scale_{factor} - 0.5 + + + else: + + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + + H_out = H_{in} * scale_{factor} + W_out = W_{in} * scale_{factor} + + + Args: input(${x_type}): ${x_comment}. @@ -6738,6 +6891,8 @@ def resize_bilinear(input, set, otherwise errors would be occured in graph constructing stage. Default: None + align_corners(bool): ${align_corners_comment} + align_mode(int): ${align_mode_comment} Returns: ${out_comment}. @@ -6748,7 +6903,8 @@ def resize_bilinear(input, out = fluid.layers.resize_bilinear(input, out_shape=[12, 12]) """ - return image_resize(input, out_shape, scale, name, 'BILINEAR', actual_shape) + return image_resize(input, out_shape, scale, name, 'BILINEAR', actual_shape, + align_corners, align_mode) @templatedoc(op_type="nearest_interp") @@ -6756,13 +6912,48 @@ def resize_nearest(input, out_shape=None, scale=None, name=None, - actual_shape=None): + actual_shape=None, + align_corners=True): """ Resize input by performing nearest neighbor interpolation in both the 3rd dimention(in height direction) and the 4th dimention(in width direction) based on given output shape which specified by actual_shape, out_shape and scale in priority order.
+ Example: + + For scale: + + if align_corners = True && out_size > 1 : + + scale_factor = (in_size-1.0)/(out_size-1.0) + + else: + + scale_factor = float(in_size/out_size) + + + Nearest neighbor interpolation: + + if: + align_corners = False + + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + + H_out = \\left \\lfloor {H_{in} * scale_{factor}} \\right \\rfloor + W_out = \\left \\lfloor {W_{in} * scale_{factor}} \\right \\rfloor + + else: + align_corners = True + + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + + H_out = round(H_{in} * scale_{factor}) + W_out = round(W_{in} * scale_{factor}) + + For details of nearest neighbor interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation @@ -6789,6 +6980,7 @@ def resize_nearest(input, set, otherwise errors would be occured in graph constructing stage. Default: None + align_corners(bool): ${align_corners_comment} Returns: ${out_comment}. @@ -6799,7 +6991,8 @@ def resize_nearest(input, out = fluid.layers.resize_nearest(input, out_shape=[12, 12]) """ - return image_resize(input, out_shape, scale, name, 'NEAREST', actual_shape) + return image_resize(input, out_shape, scale, name, 'NEAREST', actual_shape, + align_corners) def image_resize_short(input, out_short_len, resample='BILINEAR'): diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 6c18af7283e19bd431c8d543255d900dc89cba09..3dcf9dc06998be9c38a48f18075cbf99f3dccb1a 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -135,7 +135,7 @@ def thresholded_relu(x, threshold=None): if val is not None: kwargs[name] = val - _thresholded_relu_(**kwargs) + return _thresholded_relu_(**kwargs) thresholded_relu.__doc__ = _thresholded_relu_.__doc__ + """ diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 14f4276e2f4fc4a24d701ef05c94b88c4f0336da..e0e781a322b3eb68e3f54a66252a8d8b11a9a56f 100644 ---
a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -387,7 +387,7 @@ class Optimizer(object): params_grads = [] for param in parameters: - if param.stop_gradient: + if param.stop_gradient or not param.trainable: continue # create gradient variable grad_var = Variable( diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index a07ff6ac69ca20c8c68659a67606076ce8cdf027..52b260efd15066a114a8146106685043654c91ea 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -146,6 +146,9 @@ class ParallelExecutor(object): # step4: get main_program, scope, local_scopes main = main_program if main_program \ else framework.default_main_program() + # FIXME(dzhwinter): enable_inplace should be after memory_optimize + # if turn on python memory optimize, turn off the inplace_pass. + build_strategy.enable_inplace = False if main._is_mem_optimized else True scope = scope if scope is not None else executor.global_scope() if share_vars_from and not isinstance(share_vars_from, diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index 2d9ed9f9c69a15af454bfec5918fd8bab27d6e4c..0d39a139eed87f900b1f59fd0569b6acaec0962b 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -50,6 +50,19 @@ class TestDetection(unittest.TestCase): self.assertEqual(out.shape[-1], 6) print(str(program)) + def test_box_coder_api(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[4], dtype='float32') + y = layers.data(name='z', shape=[4], dtype='float32', lod_level=1) + bcoder = layers.box_coder( + prior_box=x, + prior_box_var=[0.1, 0.2, 0.1, 0.2], + target_box=y, + code_type='encode_center_size') + self.assertIsNotNone(bcoder) + print(str(program)) + def test_detection_api(self): program = Program() with program_guard(program): @@ -463,11 +476,33 @@ 
class TestYoloDetection(unittest.TestCase): x = layers.data(name='x', shape=[30, 7, 7], dtype='float32') gtbox = layers.data(name='gtbox', shape=[10, 4], dtype='float32') gtlabel = layers.data(name='gtlabel', shape=[10], dtype='int32') - loss = layers.yolov3_loss(x, gtbox, gtlabel, [10, 13, 30, 13], 10, - 0.5) + loss = layers.yolov3_loss(x, gtbox, gtlabel, [10, 13, 30, 13], + [0, 1], 10, 0.7, 32) self.assertIsNotNone(loss) +class TestBoxClip(unittest.TestCase): + def test_box_clip(self): + program = Program() + with program_guard(program): + input_box = layers.data( + name='input_box', shape=[7, 4], dtype='float32', lod_level=1) + im_info = layers.data(name='im_info', shape=[3], dtype='float32') + out = layers.box_clip(input_box, im_info) + self.assertIsNotNone(out) + + +class TestMulticlassNMS(unittest.TestCase): + def test_multiclass_nms(self): + program = Program() + with program_guard(program): + bboxes = layers.data( + name='bboxes', shape=[-1, 10, 4], dtype='float32') + scores = layers.data(name='scores', shape=[-1, 10], dtype='float32') + output = layers.multiclass_nms(bboxes, scores, 0.3, 400, 200, 0.7) + self.assertIsNotNone(output) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index c23dfa01e76c21d0d162f2fed986e2eaf3a70a6d..4b26bacce968a6da72e9aa043adb38918b293a35 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -1,15 +1,6 @@ file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") -# The MKLDNN tests are skiped when the MKLDNN flag is OFF -if(NOT WITH_MKLDNN) - foreach(src ${TEST_OPS}) - if(${src} MATCHES ".*_mkldnn_op$") - list(REMOVE_ITEM TEST_OPS ${src}) - endif() - endforeach() -endif(NOT WITH_MKLDNN) - if(NOT WITH_DISTRIBUTE) list(REMOVE_ITEM TEST_OPS test_recv_op) list(REMOVE_ITEM TEST_OPS 
test_dist_transpiler) @@ -85,6 +76,7 @@ list(REMOVE_ITEM TEST_OPS test_image_classification_resnet) list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op) list(REMOVE_ITEM TEST_OPS test_nearest_interp_op) list(REMOVE_ITEM TEST_OPS test_imperative_resnet) +list(REMOVE_ITEM TEST_OPS test_imperative_optimizer) foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) @@ -94,6 +86,8 @@ py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op SERIAL) py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op SERIAL) py_test_modules(test_imperative_resnet MODULES test_imperative_resnet ENVS FLAGS_cudnn_deterministic=1) +py_test_modules(test_imperative_optimizer MODULES test_imperative_optimizer ENVS + FLAGS_cudnn_deterministic=1) if(WITH_DISTRIBUTE) py_test_modules(test_dist_train MODULES test_dist_train SERIAL) set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20) @@ -116,7 +110,15 @@ py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executo if(NOT APPLE) py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL) endif() +if(CMAKE_BUILD_TYPE STREQUAL "Debug") + # change the timeout from 600 to 900, because in debug mode, this test need more time. 
+ set_tests_properties(test_image_classification_resnet PROPERTIES TIMEOUT 900) +endif() if (WITH_NGRAPH) add_subdirectory(ngraph) endif() + +if (WITH_MKLDNN) + add_subdirectory(mkldnn) +endif() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt b/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..f71e04c09aa38b8cf7b3a167b84d4dc0e6cc3ec7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt @@ -0,0 +1,6 @@ +file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +foreach(TEST_OP ${TEST_OPS}) + py_test_modules(${TEST_OP} MODULES ${TEST_OP}) +endforeach(TEST_OP) diff --git a/python/paddle/fluid/tests/unittests/mkldnn/__init__.py b/python/paddle/fluid/tests/unittests/mkldnn/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b94a21a7e406b833797f8f521c62a2351c2bc30a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/python/paddle/fluid/tests/unittests/test_activation_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py similarity index 94% rename from python/paddle/fluid/tests/unittests/test_activation_mkldnn_op.py rename to python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py index 611d0dd076b827b0f528f2e3a31182cc4939d1f1..ad94a4b21c347c9a2782437948c20d3b3071c679 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py @@ -17,9 +17,9 @@ from __future__ import print_function import unittest import numpy as np import paddle.fluid.core as core -from op_test import OpTest +from paddle.fluid.tests.unittests.op_test import OpTest from scipy.special import expit -from test_activation_op import TestRelu, TestTanh, TestSqrt, TestAbs +from paddle.fluid.tests.unittests.test_activation_op import TestRelu, TestTanh, TestSqrt, TestAbs class TestMKLDNNReluDim2(TestRelu): diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_batch_norm_mkldnn_op.py similarity index 92% rename from python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py rename to python/paddle/fluid/tests/unittests/mkldnn/test_batch_norm_mkldnn_op.py index 1286cee8dc1855c1b1695da46ae0b5222c065114..5fce90372d9beda9b04ab68d0a8ac5ef5c124421 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_batch_norm_mkldnn_op.py @@ -19,9 +19,9 @@ import numpy as np import paddle.fluid.core as core from paddle.fluid.op import Operator import paddle.fluid as fluid -from op_test import OpTest +from paddle.fluid.tests.unittests.op_test import OpTest from paddle.fluid.framework import grad_var_name -from test_batch_norm_op import TestBatchNormOpInference, TestBatchNormOpTraining, _reference_training, _reference_grad +from 
paddle.fluid.tests.unittests.test_batch_norm_op import TestBatchNormOpInference, TestBatchNormOpTraining, _reference_training, _reference_grad class TestMKLDNNBatchNormOpTraining(TestBatchNormOpTraining): diff --git a/python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_mkldnn_op.py similarity index 94% rename from python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py rename to python/paddle/fluid/tests/unittests/mkldnn/test_concat_mkldnn_op.py index 0f2130f9049c7ee294444282e59c654551f76603..1a399740692eab8ccea0c984a1a4f2ac984eb045 100644 --- a/python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_mkldnn_op.py @@ -15,7 +15,7 @@ from __future__ import print_function import unittest -from test_concat_op import TestConcatOp, TestConcatOp2, TestConcatOp3 +from paddle.fluid.tests.unittests.test_concat_op import TestConcatOp, TestConcatOp2, TestConcatOp3 class TestMKLDNNConcatOp(TestConcatOp): diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py similarity index 98% rename from python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py rename to python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py index 5ad376cb08e488e85be6369a91d4e81031e9e9db..100a03cea0f740a615c4a08810d4ad9e8c974d7a 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py @@ -18,8 +18,8 @@ import unittest import numpy as np import paddle.fluid.core as core -from op_test import OpTest -from test_conv2d_op import conv2d_forward_naive, TestConv2dOp +from paddle.fluid.tests.unittests.op_test import OpTest +from paddle.fluid.tests.unittests.test_conv2d_op import conv2d_forward_naive, TestConv2dOp def conv2d_forward_refer(input, filter, group, 
conv_param): diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py similarity index 91% rename from python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py rename to python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py index 438d45b84033b697c3210acc44392b93bf436df0..0542eef80070cbf281ee013c28b7092a2dd17eaa 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest -from test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride, TestWithGroup, TestWith1x1, TestWithInput1x1Filter1x1 +from paddle.fluid.tests.unittests.test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride, TestWithGroup, TestWith1x1, TestWithInput1x1Filter1x1 class TestMKLDNN(TestConv2dOp): diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py similarity index 94% rename from python/paddle/fluid/tests/unittests/test_conv2d_transpose_mkldnn_op.py rename to python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py index deefdd09abe6b9f9ca362654f21850f598337245..9bcdb7b2a975b648471714ab628caf91b6b6f3a9 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest -from test_conv2d_transpose_op import TestConv2dTransposeOp, TestWithPad, TestWithStride +from paddle.fluid.tests.unittests.test_conv2d_transpose_op import TestConv2dTransposeOp, TestWithPad, TestWithStride class TestMKLDNN(TestConv2dTransposeOp): diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_mkldnn_op.py 
b/python/paddle/fluid/tests/unittests/mkldnn/test_conv3d_mkldnn_op.py similarity index 91% rename from python/paddle/fluid/tests/unittests/test_conv3d_mkldnn_op.py rename to python/paddle/fluid/tests/unittests/mkldnn/test_conv3d_mkldnn_op.py index f0e1265e142b800587599783367eca2203033bf1..080b74502fbe83e97e88a65866e0d9b66b37033e 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv3d_mkldnn_op.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest -from test_conv3d_op import TestConv3dOp, TestCase1, TestWithGroup1, TestWithGroup2, TestWith1x1, TestWithInput1x1Filter1x1 +from paddle.fluid.tests.unittests.test_conv3d_op import TestConv3dOp, TestCase1, TestWithGroup1, TestWithGroup2, TestWith1x1, TestWithInput1x1Filter1x1 class TestMKLDNN(TestConv3dOp): diff --git a/python/paddle/fluid/tests/unittests/test_dequantize_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py similarity index 97% rename from python/paddle/fluid/tests/unittests/test_dequantize_mkldnn_op.py rename to python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py index 0c5e1abd7c8fb010357998c0ceaebaf21619fda9..9a54f927cbde648bbbb06d043bbc1391ee43c314 100644 --- a/python/paddle/fluid/tests/unittests/test_dequantize_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest import numpy as np -from op_test import OpTest +from paddle.fluid.tests.unittests.op_test import OpTest class TestDeQuantizeOp(OpTest): diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py similarity index 97% rename from python/paddle/fluid/tests/unittests/test_elementwise_add_mkldnn_op.py rename to python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py 
index d85cc1f856df8eaa73cef318b48a292042488edf..c3a42656b71d09dbc22abf8ce2ddc243b43b422f 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_add_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py @@ -16,8 +16,8 @@ from __future__ import print_function import unittest import numpy as np import paddle.fluid.core as core -from op_test import OpTest -from test_elementwise_add_op import * +from paddle.fluid.tests.unittests.op_test import OpTest +from paddle.fluid.tests.unittests.test_elementwise_add_op import * ''' Some tests differ from the tests defined in test_elementwise_add_op.py because MKLDNN does not support tensors of number of dimensions 3. diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py similarity index 98% rename from python/paddle/fluid/tests/unittests/test_elementwise_mul_mkldnn_op.py rename to python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py index 536e9a1c58ec4a8b1b5a7c1d3a5fe737b38d24ab..738715dd70181988028adff1c50be3a52199c312 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py @@ -15,10 +15,10 @@ from __future__ import print_function import unittest import numpy as np -from op_test import OpTest +from paddle.fluid.tests.unittests.op_test import OpTest import paddle.fluid.core as core from paddle.fluid.op import Operator -from test_elementwise_mul_op import * +from paddle.fluid.tests.unittests.test_elementwise_mul_op import * class TestElementwiseMulMKLDNNOp_BroadcastNCHW16c(ElementwiseMulOp): diff --git a/python/paddle/fluid/tests/unittests/test_fc_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_fc_mkldnn_op.py similarity index 98% rename from python/paddle/fluid/tests/unittests/test_fc_mkldnn_op.py rename to 
python/paddle/fluid/tests/unittests/mkldnn/test_fc_mkldnn_op.py index 45951a34d6f61a242cb2dc004d6801a6c1c9dd92..84229a5cffbb466ef3c69cd997adacfb21f6aae2 100644 --- a/python/paddle/fluid/tests/unittests/test_fc_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_fc_mkldnn_op.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest import numpy as np -from op_test import OpTest +from paddle.fluid.tests.unittests.op_test import OpTest def fully_connected_naive(input, weights, bias_data=None): diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_random_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_gaussian_random_mkldnn_op.py similarity index 90% rename from python/paddle/fluid/tests/unittests/test_gaussian_random_mkldnn_op.py rename to python/paddle/fluid/tests/unittests/mkldnn/test_gaussian_random_mkldnn_op.py index 9777ec390656d3f6166bf9f5de7bbad8b6bd786d..c18bd77bd3e6de08283f3ac3a31c73453f3c9129 100644 --- a/python/paddle/fluid/tests/unittests/test_gaussian_random_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_gaussian_random_mkldnn_op.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest -from test_gaussian_random_op import TestGaussianRandomOp +from paddle.fluid.tests.unittests.test_gaussian_random_op import TestGaussianRandomOp class TestMKLDNN(TestGaussianRandomOp): diff --git a/python/paddle/fluid/tests/unittests/test_lrn_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_lrn_mkldnn_op.py similarity index 96% rename from python/paddle/fluid/tests/unittests/test_lrn_mkldnn_op.py rename to python/paddle/fluid/tests/unittests/mkldnn/test_lrn_mkldnn_op.py index f6bb2ab7a696c40cb61dd5b38ca702b577fe7ea2..a5e6e116a5f1bc1e051ce3cfdac8cd1e5f3ed90e 100644 --- a/python/paddle/fluid/tests/unittests/test_lrn_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_lrn_mkldnn_op.py @@ -15,7 +15,7 @@ from __future__ import print_function import unittest 
-from test_lrn_op import TestLRNOp +from paddle.fluid.tests.unittests.test_lrn_op import TestLRNOp class TestLRNMKLDNNOp(TestLRNOp): diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_int8_mkldnn_op.py similarity index 94% rename from python/paddle/fluid/tests/unittests/test_pool2d_int8_mkldnn_op.py rename to python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_int8_mkldnn_op.py index f4495d0bc8198189962d033ec18b8b67f1f47c84..fca906fecc5fe8d25b9251c886398f8df778043f 100644 --- a/python/paddle/fluid/tests/unittests/test_pool2d_int8_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_int8_mkldnn_op.py @@ -19,8 +19,8 @@ import unittest import numpy as np import paddle.fluid.core as core -from op_test import OpTest -from test_pool2d_op import TestPool2D_Op, avg_pool2D_forward_naive, max_pool2D_forward_naive +from paddle.fluid.tests.unittests.op_test import OpTest +from paddle.fluid.tests.unittests.test_pool2d_op import TestPool2D_Op, avg_pool2D_forward_naive, max_pool2D_forward_naive class TestPool2dMKLDNNInt8_Op(TestPool2D_Op): diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_mkldnn_op.py similarity index 90% rename from python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py rename to python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_mkldnn_op.py index 7de5fefc148021d4109da2ac9f4b36c93a05a23f..6de43dd46e5d184ec934f2d85e0c87137e9702e0 100644 --- a/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_mkldnn_op.py @@ -15,7 +15,7 @@ from __future__ import print_function import unittest -from test_pool2d_op import TestPool2D_Op, TestCase1, TestCase2, TestCase3, TestCase4, TestCase5 +from paddle.fluid.tests.unittests.test_pool2d_op import TestPool2D_Op, TestCase1, TestCase2, TestCase3, TestCase4, TestCase5 
def create_test_mkldnn_class(parent): diff --git a/python/paddle/fluid/tests/unittests/test_quantize_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_quantize_mkldnn_op.py similarity index 97% rename from python/paddle/fluid/tests/unittests/test_quantize_mkldnn_op.py rename to python/paddle/fluid/tests/unittests/mkldnn/test_quantize_mkldnn_op.py index 99607928648be437b7f944f86a0c28b99d1775c4..132f7bd039f7797fb0fc332d6f7b8c242af46535 100644 --- a/python/paddle/fluid/tests/unittests/test_quantize_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_quantize_mkldnn_op.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest import numpy as np -from op_test import OpTest +from paddle.fluid.tests.unittests.op_test import OpTest class TestQuantizeOp(OpTest): diff --git a/python/paddle/fluid/tests/unittests/test_sum_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_sum_mkldnn_op.py similarity index 92% rename from python/paddle/fluid/tests/unittests/test_sum_mkldnn_op.py rename to python/paddle/fluid/tests/unittests/mkldnn/test_sum_mkldnn_op.py index 55820f31b81df9f3618d1004f6d21565564efa29..5928047b5171bcf33b024040ce79577b8aa0b53a 100644 --- a/python/paddle/fluid/tests/unittests/test_sum_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_sum_mkldnn_op.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest -from test_sum_op import TestSumOp +from paddle.fluid.tests.unittests.test_sum_op import TestSumOp class TestMKLDNN(TestSumOp): diff --git a/python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_mkldnn_op.py similarity index 95% rename from python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py rename to python/paddle/fluid/tests/unittests/mkldnn/test_transpose_mkldnn_op.py index 0c201b9e4f48df94924a248d820ae2cf73367560..4845eefe367f1ad6a2eb6ffd1f9b0598b1b4fbbd 100644 --- 
a/python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_mkldnn_op.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest -from test_transpose_op import TestTransposeOp +from paddle.fluid.tests.unittests.test_transpose_op import TestTransposeOp class TestTransposeMKLDNN(TestTransposeOp): diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_accuracy_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_accuracy_ngraph_op.py new file mode 100644 index 0000000000000000000000000000000000000000..13a33e20478372af370d38ab2b475e4425dc8d6e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ngraph/test_accuracy_ngraph_op.py @@ -0,0 +1,30 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle.fluid.core as core +from paddle.fluid.tests.unittests.op_test import OpTest +from paddle.fluid.tests.unittests.test_accuracy_op import TestAccuracyOp + + +class TestNGRAPHAccuracyOp(TestAccuracyOp): + def setUp(self): + super(TestNGRAPHAccuracyOp, self).setUp() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_conv2d_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_conv2d_ngraph_op.py new file mode 100644 index 0000000000000000000000000000000000000000..e5424e8a6e615820b4a1a5f2ee7e7e87dd0b22af --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ngraph/test_conv2d_ngraph_op.py @@ -0,0 +1,52 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +from paddle.fluid.tests.unittests.test_conv2d_op import * + + +class TestNGRAPH(TestConv2dOp): + def init_kernel_type(self): + super(TestNGRAPH, self).init_kernel_type() + + +class TestNGRAPHWithPad(TestWithPad): + def init_kernel_type(self): + super(TestNGRAPHWithPad, self).init_kernel_type() + + +class TestNGRAPHWithStride(TestWithStride): + def init_kernel_type(self): + super(TestNGRAPHWithStride, self).init_kernel_type() + + +class TestNGRAPHWithGroup(TestWithGroup): + def init_kernel_type(self): + super(TestNGRAPHWithGroup, self).init_kernel_type() + + +class TestNGRAPHWith1x1(TestWith1x1): + def init_kernel_type(self): + super(TestNGRAPHWith1x1, self).init_kernel_type() + + +class TestNGRAPHWithInput1x1Filter1x1(TestWithInput1x1Filter1x1): + def init_kernel_type(self): + super(TestNGRAPHWithInput1x1Filter1x1, self).init_kernel_type() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_pool2d_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_pool2d_ngraph_op.py new file mode 100644 index 0000000000000000000000000000000000000000..95e592e8ec036ad231ed57ddbc706683cb7aa153 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ngraph/test_pool2d_ngraph_op.py @@ -0,0 +1,51 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +from paddle.fluid.tests.unittests.test_pool2d_op import * + + +class TestNGRAPHPool2D_Op(TestPool2D_Op): + def init_test_case(self): + super(TestNGRAPHPool2D_Op, self).init_test_case() + + +class TestNGRAPHCase1(TestCase1): + def init_test_case(self): + super(TestNGRAPHCase1, self).init_test_case() + + +class TestNGRAPHCase2(TestCase2): + def init_test_case(self): + super(TestNGRAPHCase2, self).init_test_case() + + +class TestNGRAPHCase3(TestCase3): + def init_pool_type(self): + super(TestNGRAPHCase3, self).init_pool_type() + + +class TestNGRAPHCase4(TestCase4): + def init_pool_type(self): + super(TestNGRAPHCase4, self).init_pool_type() + + +class TestNGRAPHCase5(TestCase5): + def init_pool_type(self): + super(TestNGRAPHCase5, self).init_pool_type() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index fdacd241f9e1f8d442f55098e2d192a3d57fdaf1..c429c8af7d37cb4e209edc41f704868afe054829 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -40,7 +40,8 @@ class TestParallelExecutorBase(unittest.TestCase): seed=None, use_parallel_executor=True, use_reduce=False, - use_ir_memory_optimize=False, + use_ir_memory_optimize=True, + enable_inplace=True, fuse_elewise_add_act_ops=False, fuse_relu_depthwise_conv=False, optimizer=fluid.optimizer.Adam, @@ -60,63 +61,65 @@ class TestParallelExecutorBase(unittest.TestCase): main.random_seed = seed loss = method(use_feed=feed_dict is not None) - if optimizer: optimizer().minimize(loss) if memory_opt: fluid.memory_optimize(main) - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(startup) - exec_strategy = fluid.ExecutionStrategy() - exec_strategy.allow_op_delay = allow_op_delay - if 
use_fast_executor: - exec_strategy.use_experimental_executor = True - build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ - if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce - build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops - build_strategy.fuse_relu_depthwise_conv = fuse_relu_depthwise_conv - build_strategy.memory_optimize = use_ir_memory_optimize - build_strategy.enable_sequential_execution = enable_sequential_execution - if use_cuda and core.is_compiled_with_cuda(): - build_strategy.remove_unnecessary_lock = True - if use_parallel_executor: - binary = compiler.CompiledProgram(main).with_data_parallel( - loss_name=loss.name, - build_strategy=build_strategy, - exec_strategy=exec_strategy) - else: - binary = compiler.CompiledProgram(main) - - if batch_size is not None: - batch_size *= fluid.core.get_cuda_device_count( - ) if use_cuda else int( - os.environ.get('CPU_NUM', multiprocessing.cpu_count())) - begin = time.time() - first_loss, = run_executor( - exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name]) - - for i in range(iter): - run_executor( - exe=exe, binary=binary, feed=feed_dict, fetch_list=[]) - - last_loss, = run_executor( - exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name]) - end = time.time() - - if batch_size is not None: - print("%.4f Instance per second" % ( - (batch_size * iter + 2) / (end - begin))) - - avg_last_loss_val = np.array(last_loss).mean() - avg_first_loss_val = np.array(first_loss).mean() - if math.isnan(float(avg_last_loss_val)) or math.isnan( - float(avg_first_loss_val)): - sys.exit("got NaN loss, training failed.") - - print(first_loss, last_loss) - # self.assertGreater(first_loss[0], last_loss[0]) - return first_loss, last_loss + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(startup) + exec_strategy = fluid.ExecutionStrategy() + exec_strategy.allow_op_delay = 
allow_op_delay + if use_fast_executor: + exec_strategy.use_experimental_executor = True + build_strategy = fluid.BuildStrategy() + build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ + if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce + build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops + build_strategy.fuse_relu_depthwise_conv = fuse_relu_depthwise_conv + build_strategy.memory_optimize = use_ir_memory_optimize + # python memory optimization is conflict with inplace pass. + # Use ir graph memory optimization after inplace pass is the correct way. + build_strategy.enable_inplace = False if memory_opt else enable_inplace + build_strategy.enable_sequential_execution = enable_sequential_execution + + if use_cuda and core.is_compiled_with_cuda(): + build_strategy.remove_unnecessary_lock = True + if use_parallel_executor: + binary = compiler.CompiledProgram(main).with_data_parallel( + loss_name=loss.name, + build_strategy=build_strategy, + exec_strategy=exec_strategy) + else: + binary = compiler.CompiledProgram(main) + + if batch_size is not None: + batch_size *= fluid.core.get_cuda_device_count( + ) if use_cuda else int( + os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + begin = time.time() + first_loss, = run_executor( + exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name]) + + for i in range(iter): + run_executor(exe=exe, binary=binary, feed=feed_dict, fetch_list=[]) + + last_loss, = run_executor( + exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name]) + end = time.time() + + if batch_size is not None: + print("%.4f Instance per second" % ( + (batch_size * iter + 2) / (end - begin))) + + avg_last_loss_val = np.array(last_loss).mean() + avg_first_loss_val = np.array(first_loss).mean() + if math.isnan(float(avg_last_loss_val)) or math.isnan( + float(avg_first_loss_val)): + sys.exit("got NaN loss, training failed.") + + print(first_loss, last_loss) + # self.assertGreater(first_loss[0], 
last_loss[0]) + return first_loss, last_loss diff --git a/python/paddle/fluid/tests/unittests/test_beam_search_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_op.py index c28dda4b53ce5d394ff11222e5df8d257b4e80da..1d9f4b78f30fefa21c189036c3731e0afe39ea9e 100644 --- a/python/paddle/fluid/tests/unittests/test_beam_search_op.py +++ b/python/paddle/fluid/tests/unittests/test_beam_search_op.py @@ -38,6 +38,7 @@ class BeamSearchOpTester(unittest.TestCase): self._create_pre_ids() self.scope.var('selected_ids') self.scope.var('selected_scores') + self.scope.var('parent_idx') def test_run(self): op = Operator( @@ -48,12 +49,14 @@ class BeamSearchOpTester(unittest.TestCase): scores='scores', selected_ids='selected_ids', selected_scores='selected_scores', + parent_idx='parent_idx', level=0, beam_size=2, end_id=0, ) op.run(self.scope, core.CPUPlace()) selected_ids = self.scope.find_var("selected_ids").get_tensor() selected_scores = self.scope.find_var("selected_scores").get_tensor() + parent_idx = self.scope.find_var("parent_idx").get_tensor() self.assertTrue( np.allclose( np.array(selected_ids), np.array([4, 2, 3, 8])[:, np.newaxis])) @@ -62,6 +65,8 @@ class BeamSearchOpTester(unittest.TestCase): np.array(selected_scores), np.array([0.5, 0.6, 0.9, 0.7])[:, np.newaxis])) self.assertEqual(selected_ids.lod(), [[0, 2, 4], [0, 1, 2, 3, 4]]) + self.assertTrue( + np.allclose(np.array(parent_idx), np.array([0, 1, 2, 3]))) def _create_pre_ids(self): np_data = np.array([[1, 2, 3, 4]], dtype='int64') diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py index c8a7063dc1cd3e5cc7cd3458b51f5e74981aa75c..f60ed1d79ae5778f751d6101fde386ae3a90c0f7 100644 --- a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py +++ b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py @@ -20,7 +20,13 @@ from op_test import OpTest import paddle.fluid.core as core -def 
bilinear_interp_np(input, out_h, out_w, out_size=None, actual_shape=None): +def bilinear_interp_np(input, + out_h, + out_w, + out_size=None, + actual_shape=None, + align_corners=True, + align_mode=0): """bilinear interpolation implement in shape [N, C, H, W]""" if out_size is not None: out_h = out_size[0] @@ -29,25 +35,45 @@ def bilinear_interp_np(input, out_h, out_w, out_size=None, actual_shape=None): out_h = actual_shape[0] out_w = actual_shape[1] batch_size, channel, in_h, in_w = input.shape + + ratio_h = ratio_w = 0.0 if out_h > 1: - ratio_h = (in_h - 1.0) / (out_h - 1.0) - else: - ratio_h = 0.0 + if (align_corners): + ratio_h = (in_h - 1.0) / (out_h - 1.0) + else: + ratio_h = 1.0 * in_h / out_h if out_w > 1: - ratio_w = (in_w - 1.0) / (out_w - 1.0) - else: - ratio_w = 0.0 + if (align_corners): + ratio_w = (in_w - 1.0) / (out_w - 1.0) + else: + ratio_w = 1.0 * in_w / out_w out = np.zeros((batch_size, channel, out_h, out_w)) + for i in range(out_h): - h = int(ratio_h * i) + if (align_mode == 0 and not align_corners): + h = int(ratio_h * (i + 0.5) - 0.5) + else: + h = int(ratio_h * i) + + h = max(0, h) hid = 1 if h < in_h - 1 else 0 - h1lambda = ratio_h * i - h + if (align_mode == 0 and not align_corners): + h1lambda = ratio_h * (i + 0.5) - 0.5 - h + else: + h1lambda = ratio_h * i - h h2lambda = 1.0 - h1lambda for j in range(out_w): - w = int(ratio_w * j) + if (align_mode == 0 and not align_corners): + w = int(ratio_w * (j + 0.5) - 0.5) + else: + w = int(ratio_w * j) + w = max(0, w) wid = 1 if w < in_w - 1 else 0 - w1lambda = ratio_w * j - w + if (align_mode == 0 and not align_corners): + w1lambda = ratio_w * (j + 0.5) - 0.5 - w + else: + w1lambda = ratio_w * j - w w2lambda = 1.0 - w1lambda out[:, :, i, j] = h2lambda*(w2lambda*input[:, :, h, w] + @@ -66,7 +92,8 @@ class TestBilinearInterpOp(OpTest): input_np = np.random.random(self.input_shape).astype("float32") output_np = bilinear_interp_np(input_np, self.out_h, self.out_w, - self.out_size, self.actual_shape) + 
self.out_size, self.actual_shape, + self.align_corners, self.align_mode) self.inputs = {'X': input_np} if self.out_size is not None: self.inputs['OutSize'] = self.out_size @@ -75,7 +102,9 @@ class TestBilinearInterpOp(OpTest): self.attrs = { 'out_h': self.out_h, 'out_w': self.out_w, - 'interp_method': self.interp_method + 'interp_method': self.interp_method, + 'align_corners': self.align_corners, + 'align_mode': self.align_mode } self.outputs = {'Out': output_np} @@ -91,6 +120,8 @@ class TestBilinearInterpOp(OpTest): self.out_h = 2 self.out_w = 2 self.out_size = np.array([3, 3]).astype("int32") + self.align_corners = True + self.align_mode = 1 class TestBilinearInterpCase1(TestBilinearInterpOp): @@ -99,6 +130,8 @@ class TestBilinearInterpCase1(TestBilinearInterpOp): self.input_shape = [4, 1, 7, 8] self.out_h = 1 self.out_w = 1 + self.align_corners = True + self.align_mode = 1 class TestBilinearInterpCase2(TestBilinearInterpOp): @@ -107,6 +140,8 @@ class TestBilinearInterpCase2(TestBilinearInterpOp): self.input_shape = [3, 3, 9, 6] self.out_h = 12 self.out_w = 12 + self.align_corners = True + self.align_mode = 1 class TestBilinearInterpCase3(TestBilinearInterpOp): @@ -115,6 +150,8 @@ class TestBilinearInterpCase3(TestBilinearInterpOp): self.input_shape = [1, 1, 128, 64] self.out_h = 64 self.out_w = 128 + self.align_corners = True + self.align_mode = 1 class TestBilinearInterpCase4(TestBilinearInterpOp): @@ -124,6 +161,8 @@ class TestBilinearInterpCase4(TestBilinearInterpOp): self.out_h = 1 self.out_w = 1 self.out_size = np.array([2, 2]).astype("int32") + self.align_corners = True + self.align_mode = 1 class TestBilinearInterpCase5(TestBilinearInterpOp): @@ -133,6 +172,8 @@ class TestBilinearInterpCase5(TestBilinearInterpOp): self.out_h = 12 self.out_w = 12 self.out_size = np.array([11, 11]).astype("int32") + self.align_corners = True + self.align_mode = 1 class TestBilinearInterpCase6(TestBilinearInterpOp): @@ -142,6 +183,8 @@ class 
TestBilinearInterpCase6(TestBilinearInterpOp): self.out_h = 64 self.out_w = 128 self.out_size = np.array([65, 129]).astype("int32") + self.align_corners = True + self.align_mode = 1 class TestBilinearInterpActualShape(TestBilinearInterpOp): @@ -151,6 +194,8 @@ class TestBilinearInterpActualShape(TestBilinearInterpOp): self.out_h = 64 self.out_w = 32 self.out_size = np.array([66, 40]).astype("int32") + self.align_corners = True + self.align_mode = 1 class TestBilinearInterpOpUint8(OpTest): @@ -162,14 +207,17 @@ class TestBilinearInterpOpUint8(OpTest): input_np = np.random.randint( low=0, high=256, size=self.input_shape).astype("uint8") output_np = bilinear_interp_np(input_np, self.out_h, self.out_w, - self.out_size, self.actual_shape) + self.out_size, self.actual_shape, + self.align_corners, self.align_mode) self.inputs = {'X': input_np} if self.out_size is not None: self.inputs['OutSize'] = self.out_size self.attrs = { 'out_h': self.out_h, 'out_w': self.out_w, - 'interp_method': self.interp_method + 'interp_method': self.interp_method, + 'align_corners': self.align_corners, + 'align_mode': self.align_mode } self.outputs = {'Out': output_np} @@ -181,6 +229,8 @@ class TestBilinearInterpOpUint8(OpTest): self.input_shape = [1, 3, 9, 6] self.out_h = 10 self.out_w = 9 + self.align_corners = True + self.align_mode = 1 class TestBilinearInterpCase1Uint8(TestBilinearInterpOpUint8): @@ -189,6 +239,8 @@ class TestBilinearInterpCase1Uint8(TestBilinearInterpOpUint8): self.input_shape = [2, 3, 128, 64] self.out_h = 120 self.out_w = 50 + self.align_corners = True + self.align_mode = 1 class TestBilinearInterpCase2Uint8(TestBilinearInterpOpUint8): @@ -198,6 +250,26 @@ class TestBilinearInterpCase2Uint8(TestBilinearInterpOpUint8): self.out_h = 5 self.out_w = 13 self.out_size = np.array([6, 15]).astype("int32") + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpOtherMethod1(TestBilinearInterpOp): + def set_align_mode(self): + self.align_corners = False + 
self.align_mode = 1 + + +class TestBilinearInterpWithMethod2(TestBilinearInterpOp): + def set_align_mode(self): + self.align_corners = False + self.align_mode = 0 + + +class TestBilinearInterpWithMethod3(TestBilinearInterpOp): + def set_align_mode(self): + self.align_corners = True + self.align_mode = 0 if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_box_clip_op.py b/python/paddle/fluid/tests/unittests/test_box_clip_op.py new file mode 100644 index 0000000000000000000000000000000000000000..b2b0598f31dd27e12e5ce329129129b5e0f1caf0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_box_clip_op.py @@ -0,0 +1,70 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +import sys +import math +from op_test import OpTest +import copy + + +def box_clip(input_box, im_info, output_box): + im_w = round(im_info[1] / im_info[2]) + im_h = round(im_info[0] / im_info[2]) + output_box[:, :, 0] = np.maximum( + np.minimum(input_box[:, :, 0], im_w - 1), 0) + output_box[:, :, 1] = np.maximum( + np.minimum(input_box[:, :, 1], im_h - 1), 0) + output_box[:, :, 2] = np.maximum( + np.minimum(input_box[:, :, 2], im_w - 1), 0) + output_box[:, :, 3] = np.maximum( + np.minimum(input_box[:, :, 3], im_h - 1), 0) + + +def batch_box_clip(input_boxes, im_info, lod): + n = input_boxes.shape[0] + m = input_boxes.shape[1] + output_boxes = np.zeros((n, m, 4), dtype=np.float32) + cur_offset = 0 + for i in range(len(lod)): + box_clip(input_boxes[cur_offset:(cur_offset + lod[i]), :, :], + im_info[i, :], + output_boxes[cur_offset:(cur_offset + lod[i]), :, :]) + cur_offset += lod[i] + return output_boxes + + +class TestBoxClipOp(OpTest): + def test_check_output(self): + self.check_output() + + def setUp(self): + self.op_type = "box_clip" + lod = [[1, 2, 3]] + input_boxes = np.random.random((6, 10, 4)) * 5 + im_info = np.array([[5, 8, 1.], [6, 6, 1.], [7, 5, 1.]]) + output_boxes = batch_box_clip(input_boxes, im_info, lod[0]) + + self.inputs = { + 'Input': (input_boxes.astype('float32'), lod), + 'ImInfo': im_info.astype('float32'), + } + self.outputs = {'Output': output_boxes} + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_box_coder_op.py b/python/paddle/fluid/tests/unittests/test_box_coder_op.py index 2511c5c22e012babdeb71a71d3546456ea2ceaf3..220bffebe83925c60af65aa9594ddd8a29c38145 100644 --- a/python/paddle/fluid/tests/unittests/test_box_coder_op.py +++ b/python/paddle/fluid/tests/unittests/test_box_coder_op.py @@ -21,80 +21,82 @@ import math from op_test import OpTest -def box_coder(target_box, prior_box, prior_box_var, 
output_box, code_type, - box_normalized): - prior_box_x = ( - (prior_box[:, 2] + prior_box[:, 0]) / 2).reshape(1, prior_box.shape[0]) - prior_box_y = ( - (prior_box[:, 3] + prior_box[:, 1]) / 2).reshape(1, prior_box.shape[0]) - prior_box_width = ( - (prior_box[:, 2] - prior_box[:, 0])).reshape(1, prior_box.shape[0]) - prior_box_height = ( - (prior_box[:, 3] - prior_box[:, 1])).reshape(1, prior_box.shape[0]) - prior_box_var = prior_box_var.reshape(1, prior_box_var.shape[0], - prior_box_var.shape[1]) - if not box_normalized: - prior_box_height = prior_box_height + 1 - prior_box_width = prior_box_width + 1 - - if (code_type == "EncodeCenterSize"): - target_box_x = ((target_box[:, 2] + target_box[:, 0]) / 2).reshape( - target_box.shape[0], 1) - target_box_y = ((target_box[:, 3] + target_box[:, 1]) / 2).reshape( - target_box.shape[0], 1) - target_box_width = ((target_box[:, 2] - target_box[:, 0])).reshape( - target_box.shape[0], 1) - target_box_height = ((target_box[:, 3] - target_box[:, 1])).reshape( - target_box.shape[0], 1) - if not box_normalized: - target_box_height = target_box_height + 1 - target_box_width = target_box_width + 1 - - output_box[:,:,0] = (target_box_x - prior_box_x) / prior_box_width / \ - prior_box_var[:,:,0] - output_box[:,:,1] = (target_box_y - prior_box_y) / prior_box_height / \ - prior_box_var[:,:,1] - output_box[:,:,2] = np.log(np.fabs(target_box_width / prior_box_width)) / \ - prior_box_var[:,:,2] - output_box[:,:,3] = np.log(np.fabs(target_box_height / prior_box_height)) / \ - prior_box_var[:,:,3] - - elif (code_type == "DecodeCenterSize"): - target_box_x = prior_box_var[:,:,0] * target_box[:,:,0] * \ - prior_box_width + prior_box_x - target_box_y = prior_box_var[:,:,1] * target_box[:,:,1] * \ - prior_box_height + prior_box_y - target_box_width = np.exp(prior_box_var[:,:,2] * target_box[:,:,2]) * \ - prior_box_width - target_box_height = np.exp(prior_box_var[:,:,3] * target_box[:,:,3]) * \ - prior_box_height - - output_box[:, :, 0] = 
target_box_x - target_box_width / 2 - output_box[:, :, 1] = target_box_y - target_box_height / 2 - output_box[:, :, 2] = target_box_x + target_box_width / 2 - output_box[:, :, 3] = target_box_y + target_box_height / 2 - if not box_normalized: - output_box[:, :, 2] = output_box[:, :, 2] - 1 - output_box[:, :, 3] = output_box[:, :, 3] - 1 - - -def batch_box_coder(prior_box, prior_box_var, target_box, lod, code_type, - box_normalized): - n = target_box.shape[0] - m = prior_box.shape[0] +def box_decoder(t_box, p_box, pb_v, output_box, norm, axis=0): + pb_w = p_box[:, 2] - p_box[:, 0] + (norm == False) + pb_h = p_box[:, 3] - p_box[:, 1] + (norm == False) + pb_x = pb_w * 0.5 + p_box[:, 0] + pb_y = pb_h * 0.5 + p_box[:, 1] + shape = (1, p_box.shape[0]) if axis == 0 else (p_box.shape[0], 1) + + pb_w = pb_w.reshape(shape) + pb_h = pb_h.reshape(shape) + pb_x = pb_x.reshape(shape) + pb_y = pb_y.reshape(shape) + + if pb_v.ndim == 2: + var_shape = (1, pb_v.shape[0], pb_v.shape[1]) if axis == 0 else ( + pb_v.shape[0], 1, pb_v.shape[1]) + pb_v = pb_v.reshape(var_shape) + if pb_v.ndim == 1: + tb_x = pb_v[0] * t_box[:, :, 0] * pb_w + pb_x + tb_y = pb_v[1] * t_box[:, :, 1] * pb_h + pb_y + tb_w = np.exp(pb_v[2] * t_box[:, :, 2]) * pb_w + tb_h = np.exp(pb_v[3] * t_box[:, :, 3]) * pb_h + else: + tb_x = pb_v[:, :, 0] * t_box[:, :, 0] * pb_w + pb_x + tb_y = pb_v[:, :, 1] * t_box[:, :, 1] * pb_h + pb_y + tb_w = np.exp(pb_v[:, :, 2] * t_box[:, :, 2]) * pb_w + tb_h = np.exp(pb_v[:, :, 3] * t_box[:, :, 3]) * pb_h + output_box[:, :, 0] = tb_x - tb_w / 2 + output_box[:, :, 1] = tb_y - tb_h / 2 + output_box[:, :, 2] = tb_x + tb_w / 2 - (not norm) + output_box[:, :, 3] = tb_y + tb_h / 2 - (not norm) + + +def box_encoder(t_box, p_box, pb_v, output_box, norm): + pb_w = p_box[:, 2] - p_box[:, 0] + (norm == False) + pb_h = p_box[:, 3] - p_box[:, 1] + (norm == False) + pb_x = pb_w * 0.5 + p_box[:, 0] + pb_y = pb_h * 0.5 + p_box[:, 1] + shape = (1, p_box.shape[0]) + + pb_w = pb_w.reshape(shape) + pb_h 
= pb_h.reshape(shape) + pb_x = pb_x.reshape(shape) + pb_y = pb_y.reshape(shape) + + if pb_v.ndim == 2: + pb_v = pb_v.reshape(1, pb_v.shape[0], pb_v.shape[1]) + tb_x = ((t_box[:, 2] + t_box[:, 0]) / 2).reshape(t_box.shape[0], 1) + tb_y = ((t_box[:, 3] + t_box[:, 1]) / 2).reshape(t_box.shape[0], 1) + tb_w = (t_box[:, 2] - t_box[:, 0]).reshape(t_box.shape[0], 1) + (not norm) + tb_h = (t_box[:, 3] - t_box[:, 1]).reshape(t_box.shape[0], 1) + (not norm) + if pb_v.ndim == 1: + output_box[:, :, 0] = (tb_x - pb_x) / pb_w / pb_v[0] + output_box[:, :, 1] = (tb_y - pb_y) / pb_h / pb_v[1] + output_box[:, :, 2] = np.log(np.fabs(tb_w / pb_w)) / pb_v[2] + output_box[:, :, 3] = np.log(np.fabs(tb_h / pb_h)) / pb_v[3] + else: + output_box[:, :, 0] = (tb_x - pb_x) / pb_w / pb_v[:, :, 0] + output_box[:, :, 1] = (tb_y - pb_y) / pb_h / pb_v[:, :, 1] + output_box[:, :, 2] = np.log(np.fabs(tb_w / pb_w)) / pb_v[:, :, 2] + output_box[:, :, 3] = np.log(np.fabs(tb_h / pb_h)) / pb_v[:, :, 3] + + +def batch_box_coder(p_box, pb_v, t_box, lod, code_type, norm, axis=0): + n = t_box.shape[0] + m = p_box.shape[0] + if code_type == "DecodeCenterSize": + m = t_box.shape[1] output_box = np.zeros((n, m, 4), dtype=np.float32) cur_offset = 0 for i in range(len(lod)): if (code_type == "EncodeCenterSize"): - box_coder(target_box[cur_offset:(cur_offset + lod[i]), :], - prior_box, prior_box_var, - output_box[cur_offset:(cur_offset + lod[i]), :, :], - code_type, box_normalized) + box_encoder(t_box[cur_offset:(cur_offset + lod[i]), :], p_box, pb_v, + output_box[cur_offset:(cur_offset + lod[i]), :, :], + norm) elif (code_type == "DecodeCenterSize"): - box_coder(target_box[cur_offset:(cur_offset + lod[i]), :, :], - prior_box, prior_box_var, - output_box[cur_offset:(cur_offset + lod[i]), :, :], - code_type, box_normalized) + box_decoder(t_box, p_box, pb_v, output_box, norm, axis) cur_offset += lod[i] return output_box @@ -106,14 +108,13 @@ class TestBoxCoderOp(OpTest): def setUp(self): self.op_type = "box_coder" 
lod = [[1, 1, 1, 1, 1]] - prior_box = np.random.random((10, 4)).astype('float32') - prior_box_var = np.random.random((10, 4)).astype('float32') - target_box = np.random.random((5, 10, 4)).astype('float32') + prior_box = np.random.random((81, 4)).astype('float32') + prior_box_var = np.random.random((81, 4)).astype('float32') + target_box = np.random.random((20, 81, 4)).astype('float32') code_type = "DecodeCenterSize" box_normalized = False output_box = batch_box_coder(prior_box, prior_box_var, target_box, lod[0], code_type, box_normalized) - self.inputs = { 'PriorBox': prior_box, 'PriorBoxVar': prior_box_var, @@ -133,9 +134,9 @@ class TestBoxCoderOpWithoutBoxVar(OpTest): def setUp(self): self.op_type = "box_coder" lod = [[0, 1, 2, 3, 4, 5]] - prior_box = np.random.random((10, 4)).astype('float32') - prior_box_var = np.ones((10, 4)).astype('float32') - target_box = np.random.random((5, 10, 4)).astype('float32') + prior_box = np.random.random((81, 4)).astype('float32') + prior_box_var = np.ones((81, 4)).astype('float32') + target_box = np.random.random((20, 81, 4)).astype('float32') code_type = "DecodeCenterSize" box_normalized = False output_box = batch_box_coder(prior_box, prior_box_var, target_box, @@ -158,10 +159,10 @@ class TestBoxCoderOpWithLoD(OpTest): def setUp(self): self.op_type = "box_coder" - lod = [[4, 8, 8]] - prior_box = np.random.random((10, 4)).astype('float32') - prior_box_var = np.random.random((10, 4)).astype('float32') - target_box = np.random.random((20, 4)).astype('float32') + lod = [[10, 20, 20]] + prior_box = np.random.random((20, 4)).astype('float32') + prior_box_var = np.random.random((20, 4)).astype('float32') + target_box = np.random.random((50, 4)).astype('float32') code_type = "EncodeCenterSize" box_normalized = True output_box = batch_box_coder(prior_box, prior_box_var, target_box, @@ -176,5 +177,63 @@ class TestBoxCoderOpWithLoD(OpTest): self.outputs = {'OutputBox': output_box} +class TestBoxCoderOpWithAxis(OpTest): + def 
test_check_output(self): + self.check_output() + + def setUp(self): + self.op_type = "box_coder" + lod = [[1, 1, 1, 1, 1]] + prior_box = np.random.random((30, 4)).astype('float32') + prior_box_var = np.random.random((30, 4)).astype('float32') + target_box = np.random.random((30, 81, 4)).astype('float32') + code_type = "DecodeCenterSize" + box_normalized = False + axis = 1 + output_box = batch_box_coder(prior_box, prior_box_var, target_box, + lod[0], code_type, box_normalized, axis) + + self.inputs = { + 'PriorBox': prior_box, + 'PriorBoxVar': prior_box_var, + 'TargetBox': target_box, + } + self.attrs = { + 'code_type': 'decode_center_size', + 'box_normalized': False, + 'axis': axis + } + self.outputs = {'OutputBox': output_box} + + +class TestBoxCoderOpWithVariance(OpTest): + def test_check_output(self): + self.check_output() + + def setUp(self): + self.op_type = "box_coder" + lod = [[1, 1, 1, 1, 1]] + prior_box = np.random.random((30, 4)).astype('float32') + prior_box_var = np.random.random((4)).astype('float32') + target_box = np.random.random((30, 81, 4)).astype('float32') + code_type = "DecodeCenterSize" + box_normalized = False + axis = 1 + output_box = batch_box_coder(prior_box, prior_box_var, target_box, + lod[0], code_type, box_normalized, axis) + + self.inputs = { + 'PriorBox': prior_box, + 'TargetBox': target_box, + } + self.attrs = { + 'code_type': 'decode_center_size', + 'box_normalized': False, + 'variance': prior_box_var.astype(np.float).flatten(), + 'axis': axis + } + self.outputs = {'OutputBox': output_box} + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py index 754d5fd40953311a5deb466fa42216f72671a65a..603c8e74885d2a050e6e1e3101dce880b6eabe9c 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py +++ 
b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py @@ -16,12 +16,10 @@ import os import unittest os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" -from test_parallel_executor_transformer import TestTransformer - - -class EagerDeletionTestTransformer(TestTransformer): - pass +os.environ[ + 'RECORDIO_FILENAME'] = '/tmp/eager_deletion_transformer.wmt16.recordio' +from test_parallel_executor_transformer import TestTransformer if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py index adf35c851bf05011223e483e472900a3d415e2ee..baaddf9f2e5b123300f1d083b33ea644665348fd 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -66,6 +66,128 @@ class MLP(fluid.imperative.Layer): return x +class SimpleRNNCell(fluid.imperative.Layer): + def __init__(self, step_input_size, hidden_size, output_size, param_attr): + super(SimpleRNNCell, self).__init__() + self.step_input_size = step_input_size + self.hidden_size = hidden_size + self.output_size = output_size + self._dype = core.VarDesc.VarType.FP32 + from paddle.fluid.layer_helper import LayerHelper + self._helper = LayerHelper( + 'SimpleRNNCell', act="tanh", param_attr=param_attr) + + def _build_once(self, inputs, pre_hidden): + i2h_param_shape = [self.step_input_size, self.hidden_size] + h2h_param_shape = [self.hidden_size, self.hidden_size] + h2o_param_shape = [self.output_size, self.hidden_size] + self._i2h_w = self._helper.create_parameter( + attr=self._helper.param_attr, + shape=i2h_param_shape, + dtype=self._dtype, + is_bias=False) + self._h2h_w = self._helper.create_parameter( + attr=self._helper.param_attr, + shape=h2h_param_shape, + dtype=self._dtype, + is_bias=False) + self._h2o_w = self._helper.create_parameter( + attr=self._helper.param_attr, + shape=h2o_param_shape, + dtype=self._dtype, + is_bias=False) + + 
def forward(self, input, pre_hidden): + + tmp_i2h = self._helper.create_variable_for_type_inference(self._dtype) + tmp_h2h = self._helper.create_variable_for_type_inference(self._dtype) + hidden = self._helper.create_variable_for_type_inference(self._dype) + out = self._helper.create_variable_for_type_inference(self._dype) + softmax_out = self._helper.create_variable_for_type_inference( + self._dtype) + reduce_out = self._helper.create_variable_for_type_inference( + self._dtype) + self._helper.append_op( + type="mul", + inputs={"X": input, + "Y": self._i2h_w}, + outputs={"Out": tmp_i2h}, + attrs={"x_num_col_dims": 1, + "y_num_col_dims": 1}) + + self._helper.append_op( + type="mul", + inputs={"X": pre_hidden, + "Y": self._h2h_w}, + outputs={"Out": tmp_h2h}, + attrs={"x_num_col_dims": 1, + "y_num_col_dims": 1}) + + self._helper.append_op( + type="elementwise_add", + inputs={'X': tmp_h2h, + 'Y': tmp_i2h}, + outputs={'Out': hidden}, + attrs={'axis': -1, + 'use_mkldnn': False}) + hidden = self._helper.append_activation(hidden) + + self._helper.append_op( + type="mul", + inputs={"X": hidden, + "Y": self._h2o_w}, + outputs={"Out": out}, + attrs={"x_num_col_dims": 1, + "y_num_col_dims": 1}) + + self._helper.append_op( + type="softmax", + inputs={"X": out}, + outputs={"Out": softmax_out}, + attrs={"use_cudnn": False}) + + self._helper.append_op( + type='reduce_sum', + inputs={'X': softmax_out}, + outputs={'Out': reduce_out}, + attrs={'dim': None, + 'keep_dim': False, + 'reduce_all': True}) + + return reduce_out, hidden + + +class SimpleRNN(fluid.imperative.Layer): + def __init__(self): + super(SimpleRNN, self).__init__() + self.seq_len = 4 + self._cell = SimpleRNNCell( + 3, + 3, + 3, + fluid.ParamAttr(initializer=fluid.initializer.Constant(value=0.1))) + + def forward(self, inputs): + outs = list() + pre_hiddens = list() + + init_hidden = fluid.layers.tensor.create_parameter( + attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.1)), + shape=[1, 3], + 
dtype='float32', + is_bias=False) + pre_hidden = init_hidden + for i in range(self.seq_len): + input = fluid.layers.slice( + inputs, axes=[1], starts=[i], ends=[i + 1]) + input = fluid.layers.reshape(input, shape=[1, 3]) + out_softmax, pre_hidden = self._cell(input, pre_hidden) + outs.append(out_softmax) + + return outs, pre_hiddens + + class TestImperative(unittest.TestCase): def test_sum_op(self): x = np.ones([2, 2], np.float32) @@ -211,6 +333,41 @@ class TestImperative(unittest.TestCase): self.assertTrue(np.allclose(dy_out, static_out)) self.assertTrue(np.allclose(dy_grad, static_grad)) + def test_rnn(self): + np_inp = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0], + [10.0, 11.0, 12.0]]) + np_inp = np_inp.reshape((1, 4, 3)) + np_inp = np_inp.astype(np.float32) + with fluid.imperative.guard(): + var_inp = fluid.imperative.base.to_variable(np_inp) + var_inp = fluid.layers.reshape(var_inp, shape=[1, 4, 3]) + simple_rnn = SimpleRNN() + outs, pre_hiddens = simple_rnn.forward(var_inp) + dy_out = outs[3]._numpy() + outs[3]._backward() + dy_grad_h2o = simple_rnn._cell._h2o_w._gradient() + dy_grad_h2h = simple_rnn._cell._h2h_w._gradient() + dy_grad_i2h = simple_rnn._cell._i2h_w._gradient() + + with new_program_scope(): + inp = fluid.layers.data( + name="inp", shape=[1, 4, 3], append_batch_size=False) + simple_rnn = SimpleRNN() + outs, pre_hiddens = simple_rnn(inp) + param_grads = fluid.backward.append_backward(outs[3]) + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(fluid.default_startup_program()) + static_out, static_grad_h2o, static_grad_h2h, static_grad_i2h = exe.run( + feed={inp.name: np_inp}, + fetch_list=[ + outs[3].name, param_grads[0][1].name, + param_grads[1][1].name, param_grads[2][1].name + ]) + self.assertTrue(np.allclose(dy_out, static_out)) + self.assertTrue(np.allclose(dy_grad_h2o, static_grad_h2o)) + self.assertTrue(np.allclose(dy_grad_h2h, static_grad_h2h)) + self.assertTrue(np.allclose(dy_grad_i2h, static_grad_i2h)) + if __name__ == 
'__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py index d0a5a883174cb33a035b344f9489b2ba02ba99f1..08b155acc657c3a4a73f5b1d72ac356fc7e83a58 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py @@ -82,13 +82,14 @@ class MNIST(fluid.imperative.Layer): self._simple_img_conv_pool_2 = SimpleImgConvPool( 20, 50, 5, 2, 2, act="relu") - pool_2_shape = 50 * 8 * 8 + pool_2_shape = 50 * 4 * 4 SIZE = 10 scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5 self._fc = FC(10, param_attr=fluid.param_attr.ParamAttr( initializer=fluid.initializer.NormalInitializer( - loc=0.0, scale=scale))) + loc=0.0, scale=scale)), + act="softmax") def forward(self, inputs): x = self._simple_img_conv_pool_1(inputs) @@ -98,9 +99,9 @@ class MNIST(fluid.imperative.Layer): class TestImperativeMnist(unittest.TestCase): - def test_mnist_cpu_float32(self): + def test_mnist_float32(self): seed = 90 - + batch_num = 2 with fluid.imperative.guard(): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed @@ -112,15 +113,15 @@ class TestImperativeMnist(unittest.TestCase): dy_param_init_value = {} for batch_id, data in enumerate(train_reader()): - if batch_id >= 2: + if batch_id >= batch_num: break - x_data = np.array( + dy_x_data = np.array( [x[0].reshape(1, 28, 28) for x in data]).astype('float32') y_data = np.array([x[1] for x in data]).astype('int64').reshape( 128, 1) - img = to_variable(x_data) + img = to_variable(dy_x_data) label = to_variable(y_data) label._stop_gradient = True @@ -136,6 +137,7 @@ class TestImperativeMnist(unittest.TestCase): avg_loss._backward() sgd.minimize(avg_loss) + mnist.clear_gradients() dy_param_value = {} for param in fluid.default_main_program().global_block( ).all_parameters(): @@ -175,10 +177,10 @@ class 
TestImperativeMnist(unittest.TestCase): static_param_init_value[static_param_name_list[i]] = out[i] for batch_id, data in enumerate(train_reader()): - if batch_id >= 2: + if batch_id >= batch_num: break - x_data = np.array( + static_x_data = np.array( [x[0].reshape(1, 28, 28) for x in data]).astype('float32') y_data = np.array([x[1] for x in data]).astype('int64').reshape( [128, 1]) @@ -186,7 +188,7 @@ class TestImperativeMnist(unittest.TestCase): fetch_list = [avg_loss.name] fetch_list.extend(static_param_name_list) out = exe.run(fluid.default_main_program(), - feed={"pixel": x_data, + feed={"pixel": static_x_data, "label": y_data}, fetch_list=fetch_list) @@ -196,11 +198,12 @@ class TestImperativeMnist(unittest.TestCase): static_param_value[static_param_name_list[i - 1]] = out[i] for key, value in six.iteritems(static_param_init_value): - self.assertTrue( - np.allclose(value.all(), dy_param_init_value[key].all())) - self.assertTrue(np.allclose(static_out.all(), dy_out.all())) + self.assertTrue(np.allclose(value, dy_param_init_value[key])) + + self.assertTrue(np.allclose(static_out, dy_out)) + for key, value in six.iteritems(static_param_value): - self.assertTrue(np.allclose(value.all(), dy_param_value[key].all())) + self.assertTrue(np.allclose(value, dy_param_value[key])) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py new file mode 100644 index 0000000000000000000000000000000000000000..afe990e74ff96dfbca4f335b561f9bbe7d295246 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py @@ -0,0 +1,350 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import paddle.fluid as fluid +from paddle.fluid.imperative.nn import Embedding +import paddle.fluid.framework as framework +from paddle.fluid.optimizer import SGDOptimizer +from paddle.fluid.imperative.base import to_variable +from test_imperative_base import new_program_scope +import numpy as np +import six +from paddle.fluid.backward import append_backward + + +class SimpleLSTMRNN(fluid.imperative.Layer): + def __init__(self, + hidden_size, + num_steps, + num_layers=2, + init_scale=0.1, + dropout=None): + super(SimpleLSTMRNN, self).__init__() + self._hidden_size = hidden_size + self._num_layers = num_layers + self._init_scale = init_scale + self._dropout = dropout + self._input = None + self._num_steps = num_steps + + def _build_once(self, input_embedding, init_hidden=None, init_cell=None): + self.weight_1_arr = [] + self.weight_2_arr = [] + self.bias_arr = [] + self.hidden_array = [] + self.cell_array = [] + self.mask_array = [] + + for i in range(self._num_layers): + weight_1 = fluid.layers.create_parameter( + shape=[self._hidden_size * 2, self._hidden_size * 4], + dtype="float32", + name="fc_weight1_" + str(i), + default_initializer=fluid.initializer.UniformInitializer( + low=-self._init_scale, high=self._init_scale)) + self.weight_1_arr.append(weight_1) + bias_1 = fluid.layers.create_parameter( + [self._hidden_size * 4], + dtype="float32", + name="fc_bias1_" + str(i), + default_initializer=fluid.initializer.Constant(0.0)) + self.bias_arr.append(bias_1) + + pre_hidden = fluid.layers.slice( + 
init_hidden, axes=[0], starts=[i], ends=[i + 1]) + pre_cell = fluid.layers.slice( + init_cell, axes=[0], starts=[i], ends=[i + 1]) + pre_hidden = fluid.layers.reshape( + pre_hidden, shape=[-1, self._hidden_size]) + pre_cell = fluid.layers.reshape( + pre_cell, shape=[-1, self._hidden_size]) + self.hidden_array.append(pre_hidden) + self.cell_array.append(pre_cell) + + def parameters(self): + parameters = list() + for param in self.weight_1_arr: + parameters.append(param) + for param in self.weight_2_arr: + parameters.append(param) + for bias in self.bias_arr: + parameters.append(bias) + return parameters + + def forward(self, input_embedding, init_hidden=None, init_cell=None): + res = [] + for index in range(self._num_steps): + self._input = fluid.layers.slice( + input_embedding, axes=[1], starts=[index], ends=[index + 1]) + self._input = fluid.layers.reshape( + self._input, shape=[-1, self._hidden_size]) + for k in range(self._num_layers): + pre_hidden = self.hidden_array[k] + pre_cell = self.cell_array[k] + weight_1 = self.weight_1_arr[k] + bias = self.bias_arr[k] + + nn = fluid.layers.concat([self._input, pre_hidden], 1) + gate_input = fluid.layers.matmul(x=nn, y=weight_1) + + gate_input = fluid.layers.elementwise_add(gate_input, bias) + i, j, f, o = fluid.layers.split( + gate_input, num_or_sections=4, dim=-1) + c = pre_cell * fluid.layers.sigmoid(f) + fluid.layers.sigmoid( + i) * fluid.layers.tanh(j) + m = fluid.layers.tanh(c) * fluid.layers.sigmoid(o) + self.hidden_array[k] = m + self.cell_array[k] = c + self._input = m + + if self._dropout is not None and self._dropout > 0.0: + self._input = fluid.layers.dropout( + self._input, + dropout_prob=self._dropout, + dropout_implementation='upscale_in_train') + res.append( + fluid.layers.reshape( + self._input, shape=[1, -1, self._hidden_size])) + real_res = fluid.layers.concat(res, 0) + real_res = fluid.layers.transpose(x=real_res, perm=[1, 0, 2]) + last_hidden = fluid.layers.concat(self.hidden_array, 1) + last_hidden 
= fluid.layers.reshape( + last_hidden, shape=[-1, self._num_layers, self._hidden_size]) + last_hidden = fluid.layers.transpose(x=last_hidden, perm=[1, 0, 2]) + last_cell = fluid.layers.concat(self.cell_array, 1) + last_cell = fluid.layers.reshape( + last_cell, shape=[-1, self._num_layers, self._hidden_size]) + last_cell = fluid.layers.transpose(x=last_cell, perm=[1, 0, 2]) + return real_res, last_hidden, last_cell + + +class PtbModel(fluid.imperative.Layer): + def __init__(self, + hidden_size, + vocab_size, + num_layers=2, + num_steps=20, + init_scale=0.1, + dropout=None): + super(PtbModel, self).__init__() + self.hidden_size = hidden_size + self.vocab_size = vocab_size + self.init_scale = init_scale + self.num_layers = num_layers + self.num_steps = num_steps + self.dropout = dropout + self.simple_lstm_rnn = SimpleLSTMRNN( + hidden_size, + num_steps, + num_layers=num_layers, + init_scale=init_scale, + dropout=dropout) + self.embedding = Embedding( + size=[vocab_size, hidden_size], + dtype='float32', + is_sparse=False, + param_attr=fluid.ParamAttr( + name='embedding_para', + initializer=fluid.initializer.UniformInitializer( + low=-init_scale, high=init_scale))) + self.softmax_weight = fluid.layers.create_parameter( + [self.hidden_size, self.vocab_size], + dtype="float32", + name="softmax_weight", + default_initializer=fluid.initializer.UniformInitializer( + low=-self.init_scale, high=self.init_scale)) + self.softmax_bias = fluid.layers.create_parameter( + [self.vocab_size], + dtype="float32", + name='softmax_bias', + default_initializer=fluid.initializer.UniformInitializer( + low=-self.init_scale, high=self.init_scale)) + + def _build_once(self, input, label, init_hidden, init_cell): + pass + + def parameters(self): + parameters = self.simple_lstm_rnn.parameters() + [ + self.softmax_weight, self.softmax_bias + ] + self.embedding.parameters() + return parameters + + def forward(self, input, label, init_hidden, init_cell): + + init_h = fluid.layers.reshape( + 
init_hidden, shape=[self.num_layers, -1, self.hidden_size]) + + init_c = fluid.layers.reshape( + init_cell, shape=[self.num_layers, -1, self.hidden_size]) + + x_emb = self.embedding(input) + x_emb = fluid.layers.reshape( + x_emb, shape=[-1, self.num_steps, self.hidden_size]) + if self.dropout is not None and self.dropout > 0.0: + x_emb = fluid.layers.dropout( + x_emb, + dropout_prob=self.drop_out, + dropout_implementation='upscale_in_train') + rnn_out, last_hidden, last_cell = self.simple_lstm_rnn(x_emb, init_h, + init_c) + rnn_out = fluid.layers.reshape( + rnn_out, shape=[-1, self.num_steps, self.hidden_size]) + projection = fluid.layers.matmul(rnn_out, self.softmax_weight) + projection = fluid.layers.elementwise_add(projection, self.softmax_bias) + projection = fluid.layers.reshape( + projection, shape=[-1, self.vocab_size]) + projection = fluid.layers.reshape( + projection, shape=[-1, self.vocab_size]) + loss = fluid.layers.softmax_with_cross_entropy( + logits=projection, label=label, soft_label=False) + loss = fluid.layers.reshape(loss, shape=[-1, self.num_steps]) + loss = fluid.layers.reduce_mean(loss, dim=[0]) + loss = fluid.layers.reduce_sum(loss) + loss.permissions = True + + return loss, last_hidden, last_cell + + +class TestImperativePtbRnn(unittest.TestCase): + def test_ptb_rnn_cpu_float32(self): + seed = 90 + hidden_size = 10 + vocab_size = 1000 + num_layers = 1 + num_steps = 3 + init_scale = 0.1 + batch_size = 4 + + with fluid.imperative.guard(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + # TODO: marsyang1993 Change seed to + ptb_model = PtbModel( + hidden_size=hidden_size, + vocab_size=vocab_size, + num_layers=num_layers, + num_steps=num_steps, + init_scale=init_scale) + + sgd = SGDOptimizer(learning_rate=1e-3) + dy_param_updated = dict() + dy_param_init = dict() + dy_loss = None + last_hidden = None + last_cell = None + for i in range(2): + x_data = np.arange(12).reshape(4, 
3).astype('int64') + y_data = np.arange(1, 13).reshape(4, 3).astype('int64') + x_data = x_data.reshape((-1, num_steps, 1)) + y_data = y_data.reshape((-1, 1)) + init_hidden_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + init_cell_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + x = to_variable(x_data) + y = to_variable(y_data) + init_hidden = to_variable(init_hidden_data) + init_cell = to_variable(init_cell_data) + dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden, + init_cell) + if i == 0: + for param in ptb_model.parameters(): + dy_param_init[param.name] = param._numpy() + dy_loss._backward() + sgd.minimize(dy_loss) + for param in ptb_model.parameters(): + dy_param_updated[param.name] = param._numpy() + + with new_program_scope(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + # TODO: marsyang1993 Change seed to + ptb_model = PtbModel( + hidden_size=hidden_size, + vocab_size=vocab_size, + num_layers=num_layers, + num_steps=num_steps, + init_scale=init_scale) + + exe = fluid.Executor(fluid.CPUPlace()) + sgd = SGDOptimizer(learning_rate=1e-3) + x = fluid.layers.data(name="x", shape=[-1, 3, 1], dtype='int64') + y = fluid.layers.data(name="y", shape=[-1, 1], dtype='float32') + init_hidden = fluid.layers.data( + name="init_hidden", shape=[1], dtype='float32') + init_cell = fluid.layers.data( + name="init_cell", shape=[1], dtype='float32') + + static_loss, static_last_hidden, static_last_cell = ptb_model( + x, y, init_hidden, init_cell) + sgd.minimize(static_loss) + static_param_updated = dict() + static_param_init = dict() + static_param_name_list = list() + for param in ptb_model.parameters(): + static_param_name_list.append(param.name) + + out = exe.run(framework.default_startup_program(), + fetch_list=static_param_name_list) + for i in range(len(static_param_name_list)): + static_param_init[static_param_name_list[i]] = out[i] + 
static_loss_value = None + static_last_cell_value = None + static_last_hidden_value = None + for i in range(2): + x_data = np.arange(12).reshape(4, 3).astype('int64') + y_data = np.arange(1, 13).reshape(4, 3).astype('int64') + x_data = x_data.reshape((-1, num_steps, 1)) + y_data = y_data.reshape((-1, 1)) + init_hidden_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + init_cell_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + fetch_list = [static_loss, static_last_hidden, static_last_cell] + fetch_list.extend(static_param_name_list) + out = exe.run(fluid.default_main_program(), + feed={ + "x": x_data, + "y": y_data, + "init_hidden": init_hidden_data, + "init_cell": init_cell_data + }, + fetch_list=fetch_list) + static_loss_value = out[0] + static_last_cell_value = out[1] + static_last_hidden_value = out[2] + for k in range(3, len(out)): + static_param_updated[static_param_name_list[k - 3]] = out[k] + + self.assertTrue( + np.allclose(static_loss_value.all(), dy_loss._numpy().all())) + self.assertTrue( + np.allclose(static_last_cell_value.all(), + last_cell._numpy().all())) + self.assertTrue( + np.allclose(static_last_hidden_value.all(), + last_hidden._numpy().all())) + for key, value in six.iteritems(static_param_init): + self.assertTrue( + np.allclose(value.all(), dy_param_init[key].all())) + for key, value in six.iteritems(static_param_updated): + self.assertTrue( + np.allclose(value.all(), dy_param_updated[key].all())) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py index 87a72dd04e376cf9225e275d862b0cbbb9774e2c..c27fd0b8024a8fa3310a62de34299fb621e2902f 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py @@ -264,6 +264,7 @@ class TestImperativeResnet(unittest.TestCase): )] = 
np_array optimizer.minimize(avg_loss) + resnet.clear_gradients() dy_param_value = {} for param in fluid.default_main_program().global_block( diff --git a/python/paddle/fluid/tests/unittests/test_inference_model_io.py b/python/paddle/fluid/tests/unittests/test_inference_model_io.py index 9962702f69644b7aef7d868f086abb390441f617..9c9f86330704466c7a8801af6ab0fb2bba23f931 100644 --- a/python/paddle/fluid/tests/unittests/test_inference_model_io.py +++ b/python/paddle/fluid/tests/unittests/test_inference_model_io.py @@ -25,6 +25,7 @@ import paddle.fluid.layers as layers import paddle.fluid.optimizer as optimizer from paddle.fluid.framework import Program, program_guard from paddle.fluid.io import save_inference_model, load_inference_model +from paddle.fluid.transpiler import memory_optimize class TestBook(unittest.TestCase): @@ -82,9 +83,36 @@ class TestBook(unittest.TestCase): self.assertEqual(feed_var_names, ["x", "y"]) self.assertEqual(len(fetch_vars), 1) - self.assertEqual(str(fetch_vars[0]), str(avg_cost)) + print("fetch %s" % str(fetch_vars[0])) + self.assertTrue("scale" in str(fetch_vars[0])) self.assertEqual(expected, actual) +class TestSaveInferenceModel(unittest.TestCase): + def test_save_inference_model(self): + MODEL_DIR = "./tmp/inference_model2" + init_program = Program() + program = Program() + + # fake program without feed/fetch + with program_guard(program, init_program): + x = layers.data(name='x', shape=[2], dtype='float32') + y = layers.data(name='y', shape=[1], dtype='float32') + + y_predict = layers.fc(input=x, size=1, act=None) + + cost = layers.square_error_cost(input=y_predict, label=y) + avg_cost = layers.mean(cost) + + place = core.CPUPlace() + exe = executor.Executor(place) + exe.run(init_program, feed={}, fetch_list=[]) + + memory_optimize(program, print_log=True) + self.assertEqual(program._is_mem_optimized, True) + # will print warning message + save_inference_model(MODEL_DIR, ["x", "y"], [avg_cost], exe, program) + + if __name__ == 
'__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py b/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..4e196758efc990506957089fb5b88ebb099cca29 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py @@ -0,0 +1,76 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import unittest +import numpy as np +import paddle.fluid.core as core +import paddle.fluid as fluid +from parallel_executor_test_base import TestParallelExecutorBase + + +def fc_with_batchnorm(use_feed): + img = fluid.layers.data(name='image', shape=[784], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + hidden = img + for _ in range(3): + hidden = fluid.layers.fc( + hidden, + size=200, + act='tanh', + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0))) + + hidden = fluid.layers.batch_norm(input=hidden) + prediction = fluid.layers.fc(hidden, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.mean(loss) + return loss + + +class TestIrInplace(TestParallelExecutorBase): + @classmethod + def setUpClass(cls): + os.environ['CPU_NUM'] = str(4) + + def _fc_with_batchnorm(self, + ir_memory_optimize, + enable_inplace, + 
memory_opt=False): + + if not core.is_compiled_with_cuda(): + return + np.random.seed(5) + img = np.random.random(size=[32, 784]).astype(np.float32) + label = np.ones(shape=[32, 1], dtype='int64') + self.check_network_convergence( + fc_with_batchnorm, + feed_dict={"image": img, + "label": label}, + use_cuda=True, + memory_opt=memory_opt, + use_ir_memory_optimize=ir_memory_optimize, + enable_inplace=enable_inplace) + + def test_fc_with_batchnorm(self, delta=1e-3): + loss00 = self._fc_with_batchnorm(False, False) + loss10 = self._fc_with_batchnorm(True, False) + loss01 = self._fc_with_batchnorm(False, True) + loss11 = self._fc_with_batchnorm(True, True) + self.assertAlmostEqual(loss00, loss10, delta=delta) + self.assertAlmostEqual(loss00, loss01, delta=delta) + self.assertAlmostEqual(loss00, loss11, delta=delta) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index c13f03e86f3e375026b04a31d51ac1a5223360ef..e7bc1601a54c8615e0e787d74145aa4987b6cb88 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -58,7 +58,8 @@ class TestBook(unittest.TestCase): def test_simple_conv2d(self): program = Program() with program_guard(program, startup_program=Program()): - images = layers.data(name='pixel', shape=[3, 48, 48], dtype='int32') + images = layers.data( + name='pixel', shape=[3, 48, 48], dtype='float32') layers.conv2d(input=images, num_filters=3, filter_size=[4, 4]) print(str(program)) diff --git a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py index 9778bd694de4b21f3ff723846c77a8ad0dceb57b..8fc391a1ff2529460b038979c0c7d0a9d905a7e0 100644 --- a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py +++ b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py @@ -19,7 +19,7 @@ import copy from op_test import OpTest -def iou(box_a, box_b): +def 
iou(box_a, box_b, norm): """Apply intersection-over-union overlap between box_a and box_b """ xmin_a = min(box_a[0], box_a[2]) @@ -32,8 +32,10 @@ def iou(box_a, box_b): xmax_b = max(box_b[0], box_b[2]) ymax_b = max(box_b[1], box_b[3]) - area_a = (ymax_a - ymin_a) * (xmax_a - xmin_a) - area_b = (ymax_b - ymin_b) * (xmax_b - xmin_b) + area_a = (ymax_a - ymin_a + (norm == False)) * (xmax_a - xmin_a + + (norm == False)) + area_b = (ymax_b - ymin_b + (norm == False)) * (xmax_b - xmin_b + + (norm == False)) if area_a <= 0 and area_b <= 0: return 0.0 @@ -42,17 +44,21 @@ def iou(box_a, box_b): xb = min(xmax_a, xmax_b) yb = min(ymax_a, ymax_b) - inter_area = max(xb - xa, 0.0) * max(yb - ya, 0.0) - - box_a_area = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1]) - box_b_area = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1]) + inter_area = max(xb - xa + (norm == False), + 0.0) * max(yb - ya + (norm == False), 0.0) iou_ratio = inter_area / (area_a + area_b - inter_area) return iou_ratio -def nms(boxes, scores, score_threshold, nms_threshold, top_k=200, eta=1.0): +def nms(boxes, + scores, + score_threshold, + nms_threshold, + top_k=200, + normalized=True, + eta=1.0): """Apply non-maximum suppression at test time to avoid detecting too many overlapping bounding boxes for a given object. 
Args: @@ -87,7 +93,7 @@ def nms(boxes, scores, score_threshold, nms_threshold, top_k=200, eta=1.0): for k in range(len(selected_indices)): if keep: kept_idx = selected_indices[k] - overlap = iou(boxes[idx], boxes[kept_idx]) + overlap = iou(boxes[idx], boxes[kept_idx], normalized) keep = True if overlap <= adaptive_threshold else False else: break @@ -99,16 +105,24 @@ def nms(boxes, scores, score_threshold, nms_threshold, top_k=200, eta=1.0): def multiclass_nms(boxes, scores, background, score_threshold, nms_threshold, - nms_top_k, keep_top_k): - class_num = scores.shape[0] - priorbox_num = scores.shape[1] + nms_top_k, keep_top_k, normalized, shared): + if shared: + class_num = scores.shape[0] + priorbox_num = scores.shape[1] + else: + box_num = scores.shape[0] + class_num = scores.shape[1] selected_indices = {} num_det = 0 for c in range(class_num): if c == background: continue - indices = nms(boxes, scores[c], score_threshold, nms_threshold, - nms_top_k) + if shared: + indices = nms(boxes, scores[c], score_threshold, nms_threshold, + nms_top_k, normalized) + else: + indices = nms(boxes[:, c, :], scores[:, c], score_threshold, + nms_threshold, nms_top_k, normalized) selected_indices[c] = indices num_det += len(indices) @@ -116,7 +130,10 @@ def multiclass_nms(boxes, scores, background, score_threshold, nms_threshold, score_index = [] for c, indices in selected_indices.items(): for idx in indices: - score_index.append((scores[c][idx], c, idx)) + if shared: + score_index.append((scores[c][idx], c, idx)) + else: + score_index.append((scores[idx][c], c, idx)) sorted_score_index = sorted( score_index, key=lambda tup: tup[0], reverse=True) @@ -127,24 +144,75 @@ def multiclass_nms(boxes, scores, background, score_threshold, nms_threshold, selected_indices[c] = [] for s, c, idx in sorted_score_index: selected_indices[c].append(idx) + if not shared: + for labels in selected_indices: + selected_indices[labels].sort() num_det = keep_top_k return selected_indices, num_det -def 
batched_multiclass_nms(boxes, scores, background, score_threshold, - nms_threshold, nms_top_k, keep_top_k): +def lod_multiclass_nms(boxes, scores, background, score_threshold, + nms_threshold, nms_top_k, keep_top_k, box_lod, + normalized): + det_outs = [] + lod = [] + head = 0 + for n in range(len(box_lod[0])): + box = boxes[head:head + box_lod[0][n]] + score = scores[head:head + box_lod[0][n]] + head = head + box_lod[0][n] + nmsed_outs, nmsed_num = multiclass_nms( + box, + score, + background, + score_threshold, + nms_threshold, + nms_top_k, + keep_top_k, + normalized, + shared=False) + if nmsed_num == 0: + #lod.append(1) + continue + lod.append(nmsed_num) + for c, indices in nmsed_outs.items(): + for idx in indices: + xmin, ymin, xmax, ymax = box[idx, c, :] + det_outs.append([c, score[idx][c], xmin, ymin, xmax, ymax]) + if len(lod) == 0: + lod.append(1) + + return det_outs, lod + + +def batched_multiclass_nms(boxes, + scores, + background, + score_threshold, + nms_threshold, + nms_top_k, + keep_top_k, + normalized=True): batch_size = scores.shape[0] det_outs = [] lod = [] for n in range(batch_size): - nmsed_outs, nmsed_num = multiclass_nms(boxes[n], scores[n], background, - score_threshold, nms_threshold, - nms_top_k, keep_top_k) - lod.append(nmsed_num) - if nmsed_num == 0: continue + nmsed_outs, nmsed_num = multiclass_nms( + boxes[n], + scores[n], + background, + score_threshold, + nms_threshold, + nms_top_k, + keep_top_k, + normalized, + shared=True) + if nmsed_num == 0: + continue + lod.append(nmsed_num) tmp_det_out = [] for c, indices in nmsed_outs.items(): for idx in indices: @@ -154,7 +222,8 @@ def batched_multiclass_nms(boxes, scores, background, score_threshold, sorted_det_out = sorted( tmp_det_out, key=lambda tup: tup[0], reverse=False) det_outs.extend(sorted_det_out) - + if len(lod) == 0: + lod += [1] return det_outs, lod @@ -168,7 +237,6 @@ class TestMulticlassNMSOp(OpTest): M = 1200 C = 21 BOX_SIZE = 4 - background = 0 nms_threshold = 0.3 nms_top_k = 
400 @@ -206,6 +274,7 @@ class TestMulticlassNMSOp(OpTest): 'keep_top_k': keep_top_k, 'score_threshold': score_threshold, 'nms_eta': 1.0, + 'normalized': True, } def test_check_output(self): @@ -219,13 +288,70 @@ class TestMulticlassNMSOpNoOutput(TestMulticlassNMSOp): self.score_threshold = 2.0 +class TestMulticlassNMSLoDInput(OpTest): + def set_argument(self): + self.score_threshold = 0.01 + + def setUp(self): + self.set_argument() + M = 1200 + C = 21 + BOX_SIZE = 4 + box_lod = [[1200]] + background = 0 + nms_threshold = 0.3 + nms_top_k = 400 + keep_top_k = 200 + score_threshold = self.score_threshold + normalized = False + + scores = np.random.random((M, C)).astype('float32') + + def softmax(x): + shiftx = x - np.max(x).clip(-64.) + exps = np.exp(shiftx) + return exps / np.sum(exps) + + scores = np.apply_along_axis(softmax, 1, scores) + + boxes = np.random.random((M, C, BOX_SIZE)).astype('float32') + boxes[:, :, 0] = boxes[:, :, 0] * 10 + boxes[:, :, 1] = boxes[:, :, 1] * 10 + boxes[:, :, 2] = boxes[:, :, 2] * 10 + 10 + boxes[:, :, 3] = boxes[:, :, 3] * 10 + 10 + + nmsed_outs, lod = lod_multiclass_nms( + boxes, scores, background, score_threshold, nms_threshold, + nms_top_k, keep_top_k, box_lod, normalized) + nmsed_outs = [-1] if not nmsed_outs else nmsed_outs + nmsed_outs = np.array(nmsed_outs).astype('float32') + self.op_type = 'multiclass_nms' + self.inputs = { + 'BBoxes': (boxes, box_lod), + 'Scores': (scores, box_lod), + } + self.outputs = {'Out': (nmsed_outs, [lod])} + self.attrs = { + 'background_label': 0, + 'nms_threshold': nms_threshold, + 'nms_top_k': nms_top_k, + 'keep_top_k': keep_top_k, + 'score_threshold': score_threshold, + 'nms_eta': 1.0, + 'normalized': normalized, + } + + def test_check_output(self): + self.check_output() + + class TestIOU(unittest.TestCase): def test_iou(self): box1 = np.array([4.0, 3.0, 7.0, 5.0]).astype('float32') box2 = np.array([3.0, 4.0, 6.0, 8.0]).astype('float32') expt_output = np.array([2.0 / 16.0]).astype('float32') - 
calc_output = np.array([iou(box1, box2)]).astype('float32') + calc_output = np.array([iou(box1, box2, True)]).astype('float32') self.assertTrue(np.allclose(calc_output, expt_output)) diff --git a/python/paddle/fluid/tests/unittests/test_nearest_interp_op.py b/python/paddle/fluid/tests/unittests/test_nearest_interp_op.py index 242709425f2d3f190d3c1ed795d30938fb8e23fe..5bb2260ef7a143670dd75fc88769603d1437173d 100644 --- a/python/paddle/fluid/tests/unittests/test_nearest_interp_op.py +++ b/python/paddle/fluid/tests/unittests/test_nearest_interp_op.py @@ -24,7 +24,8 @@ def nearest_neighbor_interp_np(X, out_h, out_w, out_size=None, - actual_shape=None): + actual_shape=None, + align_corners=True): """nearest neighbor interpolation implement in shape [N, C, H, W]""" if out_size is not None: out_h = out_size[0] @@ -35,17 +36,31 @@ def nearest_neighbor_interp_np(X, n, c, in_h, in_w = X.shape ratio_h = ratio_w = 0.0 - if out_h > 1: - ratio_h = (in_h - 1.0) / (out_h - 1.0) - if out_w > 1: - ratio_w = (in_w - 1.0) / (out_w - 1.0) + if (out_h > 1): + if (align_corners): + ratio_h = (in_h - 1.0) / (out_h - 1.0) + else: + ratio_h = 1.0 * in_h / out_h + if (out_w > 1): + if (align_corners): + ratio_w = (in_w - 1.0) / (out_w - 1.0) + else: + ratio_w = 1.0 * in_w / out_w out = np.zeros((n, c, out_h, out_w)) - for i in range(out_h): - in_i = int(ratio_h * i + 0.5) - for j in range(out_w): - in_j = int(ratio_w * j + 0.5) - out[:, :, i, j] = X[:, :, in_i, in_j] + + if align_corners: + for i in range(out_h): + in_i = int(ratio_h * i + 0.5) + for j in range(out_w): + in_j = int(ratio_w * j + 0.5) + out[:, :, i, j] = X[:, :, in_i, in_j] + else: + for i in range(out_h): + in_i = int(ratio_h * i) + for j in range(out_w): + in_j = int(ratio_w * j) + out[:, :, i, j] = X[:, :, in_i, in_j] return out.astype(X.dtype) @@ -59,7 +74,8 @@ class TestNearestInterpOp(OpTest): input_np = np.random.random(self.input_shape).astype("float32") output_np = nearest_neighbor_interp_np(input_np, self.out_h, 
self.out_w, - self.out_size, self.actual_shape) + self.out_size, self.actual_shape, + self.align_corners) self.inputs = {'X': input_np} if self.out_size is not None: self.inputs['OutSize'] = self.out_size @@ -68,7 +84,8 @@ class TestNearestInterpOp(OpTest): self.attrs = { 'out_h': self.out_h, 'out_w': self.out_w, - 'interp_method': self.interp_method + 'interp_method': self.interp_method, + 'align_corners': self.align_corners, } self.outputs = {'Out': output_np} @@ -84,6 +101,7 @@ class TestNearestInterpOp(OpTest): self.out_h = 2 self.out_w = 2 self.out_size = np.array([3, 3]).astype("int32") + self.align_corners = True class TestNearestNeighborInterpCase1(TestNearestInterpOp): @@ -92,6 +110,7 @@ class TestNearestNeighborInterpCase1(TestNearestInterpOp): self.input_shape = [4, 1, 7, 8] self.out_h = 1 self.out_w = 1 + self.align_corners = True class TestNearestNeighborInterpCase2(TestNearestInterpOp): @@ -100,6 +119,7 @@ class TestNearestNeighborInterpCase2(TestNearestInterpOp): self.input_shape = [3, 3, 9, 6] self.out_h = 12 self.out_w = 12 + self.align_corners = True class TestNearestNeighborInterpCase3(TestNearestInterpOp): @@ -108,6 +128,7 @@ class TestNearestNeighborInterpCase3(TestNearestInterpOp): self.input_shape = [1, 1, 128, 64] self.out_h = 64 self.out_w = 128 + self.align_corners = True class TestNearestNeighborInterpCase4(TestNearestInterpOp): @@ -117,6 +138,7 @@ class TestNearestNeighborInterpCase4(TestNearestInterpOp): self.out_h = 1 self.out_w = 1 self.out_size = np.array([2, 2]).astype("int32") + self.align_corners = True class TestNearestNeighborInterpCase5(TestNearestInterpOp): @@ -126,6 +148,7 @@ class TestNearestNeighborInterpCase5(TestNearestInterpOp): self.out_h = 12 self.out_w = 12 self.out_size = np.array([11, 11]).astype("int32") + self.align_corners = True class TestNearestNeighborInterpCase6(TestNearestInterpOp): @@ -135,6 +158,7 @@ class TestNearestNeighborInterpCase6(TestNearestInterpOp): self.out_h = 64 self.out_w = 128 self.out_size = 
np.array([65, 129]).astype("int32") + self.align_corners = True class TestNearestNeighborInterpActualShape(TestNearestInterpOp): @@ -144,6 +168,7 @@ class TestNearestNeighborInterpActualShape(TestNearestInterpOp): self.out_h = 64 self.out_w = 32 self.out_size = np.array([66, 40]).astype("int32") + self.align_corners = True class TestNearestInterpOpUint8(OpTest): @@ -155,14 +180,16 @@ class TestNearestInterpOpUint8(OpTest): input_np = np.random.randint( low=0, high=256, size=self.input_shape).astype("uint8") output_np = nearest_neighbor_interp_np(input_np, self.out_h, self.out_w, - self.out_size, self.actual_shape) + self.out_size, self.actual_shape, + self.align_corners) self.inputs = {'X': input_np} if self.out_size is not None: self.inputs['OutSize'] = self.out_size self.attrs = { 'out_h': self.out_h, 'out_w': self.out_w, - 'interp_method': self.interp_method + 'interp_method': self.interp_method, + 'align_corners': self.align_corners } self.outputs = {'Out': output_np} @@ -174,6 +201,7 @@ class TestNearestInterpOpUint8(OpTest): self.input_shape = [1, 3, 9, 6] self.out_h = 10 self.out_w = 9 + self.align_corners = True class TestNearestNeighborInterpCase1Uint8(TestNearestInterpOpUint8): @@ -182,6 +210,7 @@ class TestNearestNeighborInterpCase1Uint8(TestNearestInterpOpUint8): self.input_shape = [2, 3, 128, 64] self.out_h = 120 self.out_w = 50 + self.align_corners = True class TestNearestNeighborInterpCase2Uint8(TestNearestInterpOpUint8): @@ -191,6 +220,12 @@ class TestNearestNeighborInterpCase2Uint8(TestNearestInterpOpUint8): self.out_h = 5 self.out_w = 13 self.out_size = np.array([6, 15]).astype("int32") + self.align_corners = True + + +class TestNearestInterpWithoutCorners(TestNearestInterpOp): + def set_align_corners(self): + self.align_corners = False if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py index 
e7a56bb6386a812e43e5c1b5c08cd0682aa9223a..9548598d75367ed1f1863d1f6ae50b83d58f8c7f 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py @@ -200,7 +200,7 @@ class TestResnet(TestParallelExecutorBase): model, use_cuda, iter=20, - delta2=1e-6): + delta2=1e-5): if use_cuda and not core.is_compiled_with_cuda(): return @@ -228,7 +228,7 @@ class TestResnet(TestParallelExecutorBase): optimizer=optimizer) for loss in zip(all_reduce_first_loss, reduce_first_loss): - self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) + self.assertAlmostEquals(loss[0], loss[1], delta=1e-5) for loss in zip(all_reduce_last_loss, reduce_last_loss): self.assertAlmostEquals(loss[0], loss[1], delta=delta2) @@ -258,17 +258,17 @@ class TestResnet(TestParallelExecutorBase): enable_sequential_execution=True) for loss in zip(all_reduce_first_loss, all_reduce_first_loss_seq): - self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) + self.assertAlmostEquals(loss[0], loss[1], delta=1e-5) for loss in zip(all_reduce_last_loss, all_reduce_last_loss_seq): self.assertAlmostEquals(loss[0], loss[1], delta=delta2) for loss in zip(reduce_first_loss, reduce_first_loss_seq): - self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) + self.assertAlmostEquals(loss[0], loss[1], delta=1e-5) for loss in zip(reduce_last_loss, reduce_last_loss_seq): self.assertAlmostEquals(loss[0], loss[1], delta=delta2) for loss in zip(all_reduce_first_loss_seq, reduce_first_loss_seq): - self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) + self.assertAlmostEquals(loss[0], loss[1], delta=1e-5) for loss in zip(all_reduce_last_loss_seq, reduce_last_loss_seq): self.assertAlmostEquals(loss[0], loss[1], delta=delta2) @@ -277,7 +277,7 @@ class TestResnet(TestParallelExecutorBase): use_cuda=True, use_reduce=False, iter=20, - delta2=1e-6): + delta2=1e-5): if use_cuda and not core.is_compiled_with_cuda(): return @@ -308,7 +308,7 @@ class 
TestResnet(TestParallelExecutorBase): optimizer=optimizer) self.assertAlmostEquals( - np.mean(parallel_first_loss), single_first_loss[0], delta=1e-6) + np.mean(parallel_first_loss), single_first_loss[0], delta=1e-5) self.assertAlmostEquals( np.mean(parallel_last_loss), single_last_loss[0], delta=delta2) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py index 3827743908c1d76931572277323d1dd5ddd05523..aacc1c3ecda8c25dec9f08827a856d38c37b1b2f 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py @@ -24,7 +24,7 @@ import paddle.fluid.core as core import paddle.dataset.wmt16 as wmt16 import os -WMT16_RECORDIO_FILE = "/tmp/wmt16.recordio" +WMT16_RECORDIO_FILE = os.environ.get('RECORDIO_FILENAME', '/tmp/wmt16.recordio') class ModelHyperParams(object): diff --git a/python/paddle/fluid/tests/unittests/test_peak_gpumem_monitor.py b/python/paddle/fluid/tests/unittests/test_peak_gpumem_monitor.py new file mode 100644 index 0000000000000000000000000000000000000000..3673fd10c4d14ba8d7a9644dcc024f58cf92a099 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_peak_gpumem_monitor.py @@ -0,0 +1,59 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import os +os.environ['FLAGS_benchmark'] = 'True' + +import numpy +import paddle.fluid.core as core +from paddle.fluid.executor import Executor +from paddle.fluid.layers import mul, data + + +class TestPeakMemoryMonitoring(unittest.TestCase): + def test_mul(self): + + a = data(name='a', shape=[784], dtype='float32') + b = data( + name='b', + shape=[784, 100], + dtype='float32', + append_batch_size=False) + out = mul(x=a, y=b) + + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + + a_np = numpy.random.random((100, 784)).astype('float32') + b_np = numpy.random.random((784, 100)).astype('float32') + self.assertEqual(0, core.get_mem_usage(0)) + exe = Executor(place) + outs = exe.run(feed={'a': a_np, 'b': b_np}, fetch_list=[out]) + out = outs[0] + #disable this assert since ctest will ignore the os.environ setting + #self.assertGreater(core.get_mem_usage(0), 0) + + raised = False + try: + core.print_mem_usage() + except: + raised = True + self.assertFalse(raised, 'Exception raised') + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py index 544fe4b4f81909b69a05d9751316e3d3137fdc45..020c1139230a9177c4d7765367359d91839d7d46 100644 --- a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py @@ -16,174 +16,179 @@ from __future__ import division import unittest import numpy as np +from scipy.special import logit +from scipy.special import expit from op_test import OpTest from paddle.fluid import core -def sigmoid(x): - return 1.0 / (1.0 + np.exp(-1.0 * x)) +def l2loss(x, y): + return 0.5 * (y - x) * (y - x) -def mse(x, y, num): - return ((y - x)**2).sum() / num +def sce(x, label): + sigmoid_x = expit(x) + term1 = label * np.log(sigmoid_x) + term2 = (1.0 - label) * np.log(1.0 - sigmoid_x) + return -term1 - 
term2 -def bce(x, y, mask): - x = x.reshape((-1)) - y = y.reshape((-1)) - mask = mask.reshape((-1)) +def sigmoid(x): + return 1.0 / (1.0 + np.exp(-1.0 * x)) - error_sum = 0.0 - count = 0 - for i in range(x.shape[0]): - if mask[i] > 0: - error_sum += y[i] * np.log(x[i]) + (1 - y[i]) * np.log(1 - x[i]) - count += 1 - return error_sum / (-1.0 * count) +def batch_xywh_box_iou(box1, box2): + b1_left = box1[:, :, 0] - box1[:, :, 2] / 2 + b1_right = box1[:, :, 0] + box1[:, :, 2] / 2 + b1_top = box1[:, :, 1] - box1[:, :, 3] / 2 + b1_bottom = box1[:, :, 1] + box1[:, :, 3] / 2 -def box_iou(box1, box2): - b1_x1 = box1[0] - box1[2] / 2 - b1_x2 = box1[0] + box1[2] / 2 - b1_y1 = box1[1] - box1[3] / 2 - b1_y2 = box1[1] + box1[3] / 2 - b2_x1 = box2[0] - box2[2] / 2 - b2_x2 = box2[0] + box2[2] / 2 - b2_y1 = box2[1] - box2[3] / 2 - b2_y2 = box2[1] + box2[3] / 2 + b2_left = box2[:, :, 0] - box2[:, :, 2] / 2 + b2_right = box2[:, :, 0] + box2[:, :, 2] / 2 + b2_top = box2[:, :, 1] - box2[:, :, 3] / 2 + b2_bottom = box2[:, :, 1] + box2[:, :, 3] / 2 - b1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1) - b2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1) + left = np.maximum(b1_left[:, :, np.newaxis], b2_left[:, np.newaxis, :]) + right = np.minimum(b1_right[:, :, np.newaxis], b2_right[:, np.newaxis, :]) + top = np.maximum(b1_top[:, :, np.newaxis], b2_top[:, np.newaxis, :]) + bottom = np.minimum(b1_bottom[:, :, np.newaxis], + b2_bottom[:, np.newaxis, :]) - inter_rect_x1 = max(b1_x1, b2_x1) - inter_rect_y1 = max(b1_y1, b2_y1) - inter_rect_x2 = min(b1_x2, b2_x2) - inter_rect_y2 = min(b1_y2, b2_y2) - inter_area = max(inter_rect_x2 - inter_rect_x1, 0) * max( - inter_rect_y2 - inter_rect_y1, 0) + inter_w = np.clip(right - left, 0., 1.) + inter_h = np.clip(bottom - top, 0., 1.) 
+ inter_area = inter_w * inter_h - return inter_area / (b1_area + b2_area + inter_area) + b1_area = (b1_right - b1_left) * (b1_bottom - b1_top) + b2_area = (b2_right - b2_left) * (b2_bottom - b2_top) + union = b1_area[:, :, np.newaxis] + b2_area[:, np.newaxis, :] - inter_area + return inter_area / union -def build_target(gtboxs, gtlabel, attrs, grid_size): - n, b, _ = gtboxs.shape - ignore_thresh = attrs["ignore_thresh"] - anchors = attrs["anchors"] - class_num = attrs["class_num"] - an_num = len(anchors) // 2 - obj_mask = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') - noobj_mask = np.ones((n, an_num, grid_size, grid_size)).astype('float32') - tx = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') - ty = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') - tw = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') - th = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') - tconf = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') - tcls = np.zeros( - (n, an_num, grid_size, grid_size, class_num)).astype('float32') +def YOLOv3Loss(x, gtbox, gtlabel, attrs): + n, c, h, w = x.shape + b = gtbox.shape[1] + anchors = attrs['anchors'] + an_num = len(anchors) // 2 + anchor_mask = attrs['anchor_mask'] + mask_num = len(anchor_mask) + class_num = attrs["class_num"] + ignore_thresh = attrs['ignore_thresh'] + downsample = attrs['downsample'] + input_size = downsample * h + x = x.reshape((n, mask_num, 5 + class_num, h, w)).transpose((0, 1, 3, 4, 2)) + loss = np.zeros((n)).astype('float32') + + pred_box = x[:, :, :, :, :4].copy() + grid_x = np.tile(np.arange(w).reshape((1, w)), (h, 1)) + grid_y = np.tile(np.arange(h).reshape((h, 1)), (1, w)) + pred_box[:, :, :, :, 0] = (grid_x + sigmoid(pred_box[:, :, :, :, 0])) / w + pred_box[:, :, :, :, 1] = (grid_y + sigmoid(pred_box[:, :, :, :, 1])) / h + + x[:, :, :, :, 5:] = np.where(x[:, :, :, :, 5:] < -0.5, x[:, :, :, :, 5:], + np.ones_like(x[:, :, :, :, 5:]) * 1.0 / + 
class_num) + + mask_anchors = [] + for m in anchor_mask: + mask_anchors.append((anchors[2 * m], anchors[2 * m + 1])) + anchors_s = np.array( + [(an_w / input_size, an_h / input_size) for an_w, an_h in mask_anchors]) + anchor_w = anchors_s[:, 0:1].reshape((1, mask_num, 1, 1)) + anchor_h = anchors_s[:, 1:2].reshape((1, mask_num, 1, 1)) + pred_box[:, :, :, :, 2] = np.exp(pred_box[:, :, :, :, 2]) * anchor_w + pred_box[:, :, :, :, 3] = np.exp(pred_box[:, :, :, :, 3]) * anchor_h + + pred_box = pred_box.reshape((n, -1, 4)) + pred_obj = x[:, :, :, :, 4].reshape((n, -1)) + objness = np.zeros(pred_box.shape[:2]).astype('float32') + ious = batch_xywh_box_iou(pred_box, gtbox) + ious_max = np.max(ious, axis=-1) + objness = np.where(ious_max > ignore_thresh, -np.ones_like(objness), + objness) + + gtbox_shift = gtbox.copy() + gtbox_shift[:, :, 0] = 0 + gtbox_shift[:, :, 1] = 0 + + anchors = [(anchors[2 * i], anchors[2 * i + 1]) for i in range(0, an_num)] + anchors_s = np.array( + [(an_w / input_size, an_h / input_size) for an_w, an_h in anchors]) + anchor_boxes = np.concatenate( + [np.zeros_like(anchors_s), anchors_s], axis=-1) + anchor_boxes = np.tile(anchor_boxes[np.newaxis, :, :], (n, 1, 1)) + ious = batch_xywh_box_iou(gtbox_shift, anchor_boxes) + iou_matches = np.argmax(ious, axis=-1) + gt_matches = iou_matches.copy() for i in range(n): for j in range(b): - if gtboxs[i, j, :].sum() == 0: + if gtbox[i, j, 2:].sum() == 0: + gt_matches[i, j] = -1 continue + if iou_matches[i, j] not in anchor_mask: + gt_matches[i, j] = -1 + continue + an_idx = anchor_mask.index(iou_matches[i, j]) + gt_matches[i, j] = an_idx + gi = int(gtbox[i, j, 0] * w) + gj = int(gtbox[i, j, 1] * h) - gt_label = gtlabel[i, j] - gx = gtboxs[i, j, 0] * grid_size - gy = gtboxs[i, j, 1] * grid_size - gw = gtboxs[i, j, 2] * grid_size - gh = gtboxs[i, j, 3] * grid_size - - gi = int(gx) - gj = int(gy) - - gtbox = [0, 0, gw, gh] - max_iou = 0 - for k in range(an_num): - anchor_box = [0, 0, anchors[2 * k], anchors[2 * k 
+ 1]] - iou = box_iou(gtbox, anchor_box) - if iou > max_iou: - max_iou = iou - best_an_index = k - if iou > ignore_thresh: - noobj_mask[i, best_an_index, gj, gi] = 0 - - obj_mask[i, best_an_index, gj, gi] = 1 - noobj_mask[i, best_an_index, gj, gi] = 0 - tx[i, best_an_index, gj, gi] = gx - gi - ty[i, best_an_index, gj, gi] = gy - gj - tw[i, best_an_index, gj, gi] = np.log(gw / anchors[2 * - best_an_index]) - th[i, best_an_index, gj, gi] = np.log( - gh / anchors[2 * best_an_index + 1]) - tconf[i, best_an_index, gj, gi] = 1 - tcls[i, best_an_index, gj, gi, gt_label] = 1 - - return (tx, ty, tw, th, tconf, tcls, obj_mask, noobj_mask) - - -def YoloV3Loss(x, gtbox, gtlabel, attrs): - n, c, h, w = x.shape - an_num = len(attrs['anchors']) // 2 - class_num = attrs["class_num"] - x = x.reshape((n, an_num, 5 + class_num, h, w)).transpose((0, 1, 3, 4, 2)) - pred_x = sigmoid(x[:, :, :, :, 0]) - pred_y = sigmoid(x[:, :, :, :, 1]) - pred_w = x[:, :, :, :, 2] - pred_h = x[:, :, :, :, 3] - pred_conf = sigmoid(x[:, :, :, :, 4]) - pred_cls = sigmoid(x[:, :, :, :, 5:]) - - tx, ty, tw, th, tconf, tcls, obj_mask, noobj_mask = build_target( - gtbox, gtlabel, attrs, x.shape[2]) - - obj_mask_expand = np.tile( - np.expand_dims(obj_mask, 4), (1, 1, 1, 1, int(attrs['class_num']))) - loss_x = mse(pred_x * obj_mask, tx * obj_mask, obj_mask.sum()) - loss_y = mse(pred_y * obj_mask, ty * obj_mask, obj_mask.sum()) - loss_w = mse(pred_w * obj_mask, tw * obj_mask, obj_mask.sum()) - loss_h = mse(pred_h * obj_mask, th * obj_mask, obj_mask.sum()) - loss_conf_target = bce(pred_conf * obj_mask, tconf * obj_mask, obj_mask) - loss_conf_notarget = bce(pred_conf * noobj_mask, tconf * noobj_mask, - noobj_mask) - loss_class = bce(pred_cls * obj_mask_expand, tcls * obj_mask_expand, - obj_mask_expand) - - return attrs['loss_weight_xy'] * (loss_x + loss_y) \ - + attrs['loss_weight_wh'] * (loss_w + loss_h) \ - + attrs['loss_weight_conf_target'] * loss_conf_target \ - + attrs['loss_weight_conf_notarget'] * 
loss_conf_notarget \ - + attrs['loss_weight_class'] * loss_class + tx = gtbox[i, j, 0] * w - gi + ty = gtbox[i, j, 1] * w - gj + tw = np.log(gtbox[i, j, 2] * input_size / mask_anchors[an_idx][0]) + th = np.log(gtbox[i, j, 3] * input_size / mask_anchors[an_idx][1]) + scale = (2.0 - gtbox[i, j, 2] * gtbox[i, j, 3]) + loss[i] += sce(x[i, an_idx, gj, gi, 0], tx) * scale + loss[i] += sce(x[i, an_idx, gj, gi, 1], ty) * scale + loss[i] += l2loss(x[i, an_idx, gj, gi, 2], tw) * scale + loss[i] += l2loss(x[i, an_idx, gj, gi, 3], th) * scale + + objness[i, an_idx * h * w + gj * w + gi] = 1.0 + + for label_idx in range(class_num): + loss[i] += sce(x[i, an_idx, gj, gi, 5 + label_idx], + float(label_idx == gtlabel[i, j])) + + for j in range(mask_num * h * w): + if objness[i, j] > 0: + loss[i] += sce(pred_obj[i, j], 1.0) + elif objness[i, j] == 0: + loss[i] += sce(pred_obj[i, j], 0.0) + + return (loss, objness.reshape((n, mask_num, h, w)).astype('float32'), \ + gt_matches.astype('int32')) class TestYolov3LossOp(OpTest): def setUp(self): - self.loss_weight_xy = 1.0 - self.loss_weight_wh = 1.0 - self.loss_weight_conf_target = 1.0 - self.loss_weight_conf_notarget = 1.0 - self.loss_weight_class = 1.0 self.initTestCase() self.op_type = 'yolov3_loss' - x = np.random.random(size=self.x_shape).astype('float32') + x = logit(np.random.uniform(0, 1, self.x_shape).astype('float32')) gtbox = np.random.random(size=self.gtbox_shape).astype('float32') - gtlabel = np.random.randint(0, self.class_num, - self.gtbox_shape[:2]).astype('int32') + gtlabel = np.random.randint(0, self.class_num, self.gtbox_shape[:2]) + gtmask = np.random.randint(0, 2, self.gtbox_shape[:2]) + gtbox = gtbox * gtmask[:, :, np.newaxis] + gtlabel = gtlabel * gtmask self.attrs = { "anchors": self.anchors, + "anchor_mask": self.anchor_mask, "class_num": self.class_num, "ignore_thresh": self.ignore_thresh, - "loss_weight_xy": self.loss_weight_xy, - "loss_weight_wh": self.loss_weight_wh, - "loss_weight_conf_target": 
self.loss_weight_conf_target, - "loss_weight_conf_notarget": self.loss_weight_conf_notarget, - "loss_weight_class": self.loss_weight_class, + "downsample": self.downsample, } - self.inputs = {'X': x, 'GTBox': gtbox, 'GTLabel': gtlabel} + self.inputs = { + 'X': x, + 'GTBox': gtbox.astype('float32'), + 'GTLabel': gtlabel.astype('int32'), + } + loss, objness, gt_matches = YOLOv3Loss(x, gtbox, gtlabel, self.attrs) self.outputs = { - 'Loss': np.array( - [YoloV3Loss(x, gtbox, gtlabel, self.attrs)]).astype('float32') + 'Loss': loss, + 'ObjectnessMask': objness, + "GTMatchMask": gt_matches } def test_check_output(self): @@ -196,19 +201,16 @@ class TestYolov3LossOp(OpTest): place, ['X'], 'Loss', no_grad_set=set(["GTBox", "GTLabel"]), - max_relative_error=0.06) + max_relative_error=0.3) def initTestCase(self): - self.anchors = [10, 13, 12, 12] - self.class_num = 10 + self.anchors = [10, 13, 16, 30, 33, 23] + self.anchor_mask = [1, 2] + self.class_num = 5 self.ignore_thresh = 0.5 - self.x_shape = (5, len(self.anchors) // 2 * (5 + self.class_num), 7, 7) - self.gtbox_shape = (5, 10, 4) - self.loss_weight_xy = 2.5 - self.loss_weight_wh = 0.8 - self.loss_weight_conf_target = 1.5 - self.loss_weight_conf_notarget = 0.5 - self.loss_weight_class = 1.2 + self.downsample = 32 + self.x_shape = (3, len(self.anchor_mask) * (5 + self.class_num), 5, 5) + self.gtbox_shape = (3, 5, 4) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/transformer_model.py b/python/paddle/fluid/tests/unittests/transformer_model.py index 143d187edc3a154418f9e639b7d492c8ce994d42..905b7d6fe75ab0080e3e97fbd4710ad913a05a38 100644 --- a/python/paddle/fluid/tests/unittests/transformer_model.py +++ b/python/paddle/fluid/tests/unittests/transformer_model.py @@ -17,6 +17,7 @@ from __future__ import print_function from functools import partial import numpy as np +import os import paddle.fluid as fluid import paddle.fluid.layers as layers from paddle.fluid.layers.io import open_recordio_file @@ 
-408,7 +409,7 @@ def transformer( trg_pad_idx, pos_pad_idx, ): file_obj = open_recordio_file( - filename='/tmp/wmt16.recordio', + filename=os.environ.get('RECORDIO_FILENAME', '/tmp/wmt16.recordio'), shapes=[ [batch_size * max_length, 1], [batch_size * max_length, 1], diff --git a/python/paddle/fluid/transpiler/details/__init__.py b/python/paddle/fluid/transpiler/details/__init__.py index f33c05ed2f48c2498b98fc486d6ff7471088d77e..82d0d336e523ec48c5ceca3b92ff0963c4499123 100644 --- a/python/paddle/fluid/transpiler/details/__init__.py +++ b/python/paddle/fluid/transpiler/details/__init__.py @@ -17,3 +17,4 @@ from __future__ import print_function from .program_utils import * from .ufind import * from .checkport import * +from .vars_distributed import * diff --git a/python/paddle/fluid/transpiler/details/vars_distributed.py b/python/paddle/fluid/transpiler/details/vars_distributed.py new file mode 100644 index 0000000000000000000000000000000000000000..05e7f6e3e706376efc8af870a780d96c45642514 --- /dev/null +++ b/python/paddle/fluid/transpiler/details/vars_distributed.py @@ -0,0 +1,269 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import print_function +from paddle.fluid.framework import Variable + + +class VarStruct(object): + """ + record part properties of a Variable in python. 
+ """ + + def __init__(self, name, shape, dtype, type, lod_level, persistable): + self.name = name + self.shape = shape + self.dtype = dtype + self.type = type + self.lod_level = lod_level + self.persistable = persistable + + +class VarDistributed(object): + """ + a class to record the var distributed on parameter servers. + the class will record the relationship between origin var and slice var. + the slice var's properties, such as type/shape/offset/endpoint. + """ + + def __init__(self, + origin_var, + slice_var, + is_slice=None, + block_id=None, + offset=None, + vtype=None, + endpoint=None): + """ + Args: + origin_var(Variable|VarStruct): origin var properties + slice_var(Variable|VarStruct): slice var properties + is_slice(bool|None): slice or not, slice_var=True/False and its block size > 8192 are the judgement standard. + block_id(int|None): the number about the slice var. + offset(int|None): if the slice var is sliced, offset is the numel before the var. + vtype(str|None): a tag, such as Optimizer/Param/RemoteProfetch. + endpoint(str|None): which parameter the slice var on, such as "127.0.0.1:1001" + """ + + if isinstance(origin_var, Variable): + self.origin = self.__create_var_struct(origin_var) + else: + self.origin = origin_var + + if isinstance(slice_var, Variable): + self.slice = self.__create_var_struct(slice_var) + else: + self.slice = slice_var + + if self.equal(self.origin, self.slice): + self.is_slice = False + self.block_id = 0 + self.offset = 0 + else: + self.is_slice = True + self.block_id = 0 + self.offset = 0 + + if is_slice is not None: + self.is_slice = is_slice + if block_id is not None: + self.block_id = block_id + if offset is not None: + self.offset = offset + + self.vtype = vtype + self.endpoint = endpoint + + @staticmethod + def __create_var_struct(var): + return VarStruct(var.name, var.shape, var.dtype, var.type, + var.lod_level, var.persistable) + + @staticmethod + def equal(var1, var2): + """ + the two var is equal or not. 
+ Returns: + bool: equal will return True else False + """ + assert isinstance(var1, VarStruct) and isinstance(var2, VarStruct) + + return var1.name == var2.name and \ + var1.type == var2.type and \ + var1.shape == var2.shape and \ + var1.dtype == var2.dtype and \ + var1.lod_level == var2.lod_level and \ + var1.persistable == var2.persistable + + def __str__(self): + origin_var_str = "{name} : fluid.{type}.shape{shape}.astype({dtype})". \ + format(i="{", e="}", name=self.origin.name, type=self.origin.type, + shape=self.origin.shape, dtype=self.origin.dtype) + + slice_var_str = "{name} : fluid.{type}.shape{shape}.astype({dtype})" \ + ".slice({is_slice}).block({block_id}).offset({offset})". \ + format(i="{", e="}", name=self.slice.name, type=self.slice.type, + shape=self.slice.shape, dtype=self.slice.dtype, + is_slice=self.is_slice, block_id=self.block_id, offset=self.offset) + + return "var owned: {}, origin var: ( {} ), slice var: ( {} ), endpoint: {} ".format( + self.vtype, origin_var_str, slice_var_str, self.endpoint) + + +class VarsDistributed(object): + """ + a gather about VarDistributed with many methods to find distributed vars. + through the class, we can get overview about the distributed parameters on parameter servers. + this class may centralized and convenient for developer to manage and get variable's distribute. + other module can also use this to find variables such io.py. + """ + + def __init__(self): + self.distributed_vars = [] + + def add_distributed_var(self, + origin_var, + slice_var, + is_slice=None, + block_id=None, + offset=None, + vtype=None, + endpoint=None): + """ + add distributed var in this. + + Args: + origin_var(Variable|VarStruct): origin var properties + slice_var(Variable|VarStruct): slice var properties + is_slice(bool|None): slice or not, slice_var=True/False and its block size > 8192 are the judgement standard. + block_id(int|None): the number about the slice var. 
+ offset(int|None): if the slice var is sliced, offset is the numel before the var. + vtype(str|None): a tag, such as Optimizer/Param/RemoteProfetch. + endpoint(str|None): which parameter the slice var on, such as "127.0.0.1:1001" + Returns: + None + """ + self.distributed_vars.append( + VarDistributed(origin_var, slice_var, is_slice, block_id, offset, + vtype, endpoint)) + + def get_distributed_var_by_slice(self, var_name): + """ + get distributed var by conditions. + + Args: + var_name(str): slice var name, such as "w.traier0.block1" + Returns: + VarDistributed: distributed var. + """ + for dist_var in self.distributed_vars: + if dist_var.slice.name == var_name: + return dist_var + return None + + @staticmethod + def equal(var1, var2): + """ + the two var is equal or not. + Returns: + bool: equal will return True else False + """ + return var1.name == var2.name and \ + var1.type == var2.type and \ + var1.shape == var2.shape and \ + var1.dtype == var2.dtype and \ + var1.lod_level == var2.lod_level and \ + var1.persistable == var2.persistable + + def get_distributed_var_by_origin_and_ep(self, origin_var_name, endpoint): + """ + get distributed var by conditions. + + Args: + origin_var_name(str): + endpoint(str): the parameter endpoint, such as "127.0.0.1:1001" + Returns: + VarDistributed: distributed var. + """ + for dist_var in self.distributed_vars: + if dist_var.origin.name == origin_var_name and dist_var.endpoint == endpoint: + return dist_var + return None + + def get_distributed_vars_by_vtypes(self, vtypes, groupby=False): + """ + get distributed vars by conditions. + + Args: + vtype(str|None): distributed var's vtype, such as "Optimizer", "RemotePrefetch" + groupby(bool|False): group by origin var or not. + + Returns: + list: distributed var list. 
+ dict: distributed var map when groupby=True + """ + vtype_vars = [] + for var in self.distributed_vars: + if var.vtype in vtypes: + vtype_vars.append(var) + if not groupby: + return vtype_vars + + params_map = {} + for var in vtype_vars: + origin_var_name = var.origin.name + + if origin_var_name in params_map.keys(): + optimizers = params_map.get(origin_var_name) + else: + optimizers = [] + optimizers.append(var) + params_map[origin_var_name] = optimizers + return params_map + + def get_distributed_vars_by_ep(self, endpoint, vtype=None): + """ + get distributed vars by conditions. + + Args: + endpoint(str): the parameter server endpoint, such as "127.0.0.1:2001" + vtype(str|None): distributed var's vtype, such as "Optimizer", "RemotePrefetch" + + Returns: + list: distributed var list. + """ + endpoint_vars = [] + for var in self.distributed_vars: + if var.endpoint == endpoint: + endpoint_vars.append(var) + if not vtype: + return endpoint_vars + + vtype_vars = [] + for var in endpoint_vars: + if var.vtype == vtype: + vtype_vars.append(var) + return vtype_vars + + def overview(self): + """ + get the overview string about all params on all parameter servers. + + Returns: + Str: overview string. + + """ + vars_str = [] + for var in self.distributed_vars: + vars_str.append(str(var)) + return "\n".join(vars_str) diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index e58f34e3750803669149685003ea5858fa775ed7..a3293afbbd7cef8470c808e98ae88a05f2e492f4 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -30,19 +30,23 @@ Steps to transpile pserver: 5. add listen_and_serv op """ +import sys import math -import numpy as np +from functools import reduce + import collections +import six import logging +import numpy as np + from .ps_dispatcher import RoundRobin, PSDispatcher from .. 
import core, framework, unique_name from ..framework import Program, default_main_program, \ - default_startup_program, Block, \ - Parameter, Variable, grad_var_name -from .details import * + default_startup_program, Block, Parameter, grad_var_name +from .details import wait_server_ready, UnionFind, VarStruct, VarsDistributed +from .details import delete_ops, find_op_by_output_arg from ..distribute_lookup_table import find_distributed_lookup_table -from functools import reduce LOOKUP_TABLE_TYPE = "lookup_table" LOOKUP_TABLE_GRAD_TYPE = "lookup_table_grad" @@ -62,260 +66,6 @@ def log(*args): print(args) -class VarStruct(object): - """ - record part properties of a Variable in python. - """ - - def __init__(self, name, shape, dtype, type, lod_level, persistable): - self.name = name - self.shape = shape - self.dtype = dtype - self.type = type - self.lod_level = lod_level - self.persistable = persistable - - -class VarDistributed(object): - """ - a class to record the var distributed on parameter servers. - the class will record the relationship between origin var and slice var. - the slice var's properties, such as type/shape/offset/endpoint. - """ - - def __init__(self, - origin_var, - slice_var, - is_slice=None, - block_id=None, - offset=None, - vtype=None, - endpoint=None): - """ - Args: - origin_var(Variable|VarStruct): origin var properties - slice_var(Variable|VarStruct): slice var properties - is_slice(bool|None): slice or not, slice_var=True/False and its block size > 8192 are the judgement standard. - block_id(int|None): the number about the slice var. - offset(int|None): if the slice var is sliced, offset is the numel before the var. - vtype(str|None): a tag, such as Optimizer/Param/RemoteProfetch. 
- endpoint(str|None): which parameter the slice var on, such as "127.0.0.1:1001" - """ - - if isinstance(origin_var, Variable): - self.origin = self.__create_var_struct(origin_var) - else: - self.origin = origin_var - - if isinstance(slice_var, Variable): - self.slice = self.__create_var_struct(slice_var) - else: - self.slice = slice_var - - if self.equal(self.origin, self.slice): - self.is_slice = False - self.block_id = 0 - self.offset = 0 - else: - self.is_slice = True - self.block_id = 0 - self.offset = 0 - - if is_slice is not None: - self.is_slice = is_slice - if block_id is not None: - self.block_id = block_id - if offset is not None: - self.offset = offset - - self.vtype = vtype - self.endpoint = endpoint - - @staticmethod - def __create_var_struct(var): - return VarStruct(var.name, var.shape, var.dtype, var.type, - var.lod_level, var.persistable) - - @staticmethod - def equal(var1, var2): - """ - the two var is equal or not. - Returns: - bool: equal will return True else False - """ - assert isinstance(var1, VarStruct) and isinstance(var2, VarStruct) - - return var1.name == var2.name and \ - var1.type == var2.type and \ - var1.shape == var2.shape and \ - var1.dtype == var2.dtype and \ - var1.lod_level == var2.lod_level and \ - var1.persistable == var2.persistable - - def __str__(self): - origin_var_str = "{name} : fluid.{type}.shape{shape}.astype({dtype})". \ - format(i="{", e="}", name=self.origin.name, type=self.origin.type, - shape=self.origin.shape, dtype=self.origin.dtype) - - slice_var_str = "{name} : fluid.{type}.shape{shape}.astype({dtype})" \ - ".slice({is_slice}).block({block_id}).offset({offset})". 
\ - format(i="{", e="}", name=self.slice.name, type=self.slice.type, - shape=self.slice.shape, dtype=self.slice.dtype, - is_slice=self.is_slice, block_id=self.block_id, offset=self.offset) - - return "var owned: {}, origin var: ( {} ), slice var: ( {} ), endpoint: {} ".format( - self.vtype, origin_var_str, slice_var_str, self.endpoint) - - -class VarsDistributed(object): - """ - a gather about VarDistributed with many methods to find distributed vars. - through the class, we can get overview about the distributed parameters on parameter servers. - this class may centralized and convenient for developer to manage and get variable's distribute. - other module can also use this to find variables such io.py. - """ - - def __init__(self): - self.distributed_vars = [] - - def add_distributed_var(self, - origin_var, - slice_var, - is_slice=None, - block_id=None, - offset=None, - vtype=None, - endpoint=None): - """ - add distributed var in this. - - Args: - origin_var(Variable|VarStruct): origin var properties - slice_var(Variable|VarStruct): slice var properties - is_slice(bool|None): slice or not, slice_var=True/False and its block size > 8192 are the judgement standard. - block_id(int|None): the number about the slice var. - offset(int|None): if the slice var is sliced, offset is the numel before the var. - vtype(str|None): a tag, such as Optimizer/Param/RemoteProfetch. - endpoint(str|None): which parameter the slice var on, such as "127.0.0.1:1001" - Returns: - None - """ - self.distributed_vars.append( - VarDistributed(origin_var, slice_var, is_slice, block_id, offset, - vtype, endpoint)) - - def get_distributed_var_by_slice(self, var_name): - """ - get distributed var by conditions. - - Args: - var_name(str): slice var name, such as "w.traier0.block1" - Returns: - VarDistributed: distributed var. 
- """ - for dist_var in self.distributed_vars: - if dist_var.slice.name == var_name: - return dist_var - return None - - @staticmethod - def equal(var1, var2): - """ - the two var is equal or not. - Returns: - bool: equal will return True else False - """ - return var1.name == var2.name and \ - var1.type == var2.type and \ - var1.shape == var2.shape and \ - var1.dtype == var2.dtype and \ - var1.lod_level == var2.lod_level and \ - var1.persistable == var2.persistable - - def get_distributed_var_by_origin_and_ep(self, origin_var_name, endpoint): - """ - get distributed var by conditions. - - Args: - origin_var_name(str): - endpoint(str): the parameter endpoint, such as "127.0.0.1:1001" - Returns: - VarDistributed: distributed var. - """ - for dist_var in self.distributed_vars: - if dist_var.origin.name == origin_var_name and dist_var.endpoint == endpoint: - return dist_var - return None - - def get_distributed_vars_by_vtypes(self, vtypes, groupby=False): - """ - get distributed vars by conditions. - - Args: - vtype(str|None): distributed var's vtype, such as "Optimizer", "RemotePrefetch" - groupby(bool|False): group by origin var or not. - - Returns: - list: distributed var list. - dict: distributed var map when groupby=True - """ - vtype_vars = [] - for var in self.distributed_vars: - if var.vtype in vtypes: - vtype_vars.append(var) - if not groupby: - return vtype_vars - - params_map = {} - for var in vtype_vars: - origin_var_name = var.origin.name - - if origin_var_name in params_map.keys(): - optimizers = params_map.get(origin_var_name) - else: - optimizers = [] - optimizers.append(var) - params_map[origin_var_name] = optimizers - return params_map - - def get_distributed_vars_by_ep(self, endpoint, vtype=None): - """ - get distributed vars by conditions. - - Args: - endpoint(str): the parameter server endpoint, such as "127.0.0.1:2001" - vtype(str|None): distributed var's vtype, such as "Optimizer", "RemotePrefetch" - - Returns: - list: distributed var list. 
- """ - endpoint_vars = [] - for var in self.distributed_vars: - if var.endpoint == endpoint: - endpoint_vars.append(var) - if not vtype: - return endpoint_vars - - vtype_vars = [] - for var in endpoint_vars: - if var.vtype == vtype: - vtype_vars.append(var) - return vtype_vars - - def overview(self): - """ - get the overview string about all params on all parameter servers. - - Returns: - Str: overview string. - - """ - vars_str = [] - for var in self.distributed_vars: - vars_str.append(str(var)) - return "\n".join(vars_str) - - class VarBlock: def __init__(self, varname, offset, size): self.varname = varname diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py index e5d48d3d19ed71624d528144f13e23770a09362a..52c1aea288fa2bb7478ad14186367900c05f64e7 100755 --- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py +++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py @@ -540,6 +540,7 @@ def memory_optimize(input_program, if skip_opt_set is not None: skip_opt_set = set(map(to_name_str, skip_opt_set)) cfgs = _get_cfgs(input_program) + input_program._is_mem_optimized = True for cfg in cfgs: cfg.memory_optimize(skip_opt_set=skip_opt_set, level=level) @@ -559,5 +560,6 @@ def release_memory(input_program, skip_opt_set=None): None """ cfgs = _get_cfgs(input_program) + input_program._is_mem_optimized = True for cfg in cfgs: cfg.release_memory(skip_opt_set=skip_opt_set) diff --git a/python/setup.py.in b/python/setup.py.in index c947785cbf7517be56c3e43120db65284ab22d10..f93f0cd130e33311bade2b15726c3eff37546214 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -109,6 +109,7 @@ packages=['paddle', 'paddle.fluid.contrib', 'paddle.fluid.contrib.decoder', 'paddle.fluid.contrib.quantize', + 'paddle.fluid.contrib.int8_inference', 'paddle.fluid.contrib.reader', 'paddle.fluid.contrib.slim', 'paddle.fluid.contrib.slim.core',