diff --git a/CMakeLists.txt b/CMakeLists.txt index e4442d254901e2524385452ebe5ac6f6df3056f9..61f5e63098c40f140774ba6bfd9a2de8d2d67bfb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,12 +25,18 @@ message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: " message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") if(WIN32) + set(CMAKE_SUPPRESS_REGENERATION ON) set(CMAKE_STATIC_LIBRARY_PREFIX lib) add_definitions("/DGOOGLE_GLOG_DLL_DECL=") set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") + add_compile_options(/wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838) + set(PADDLE_LINK_FLAGS "/IGNORE:4006 /IGNORE:4098 /IGNORE:4217 /IGNORE:4221") + set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") endif(WIN32) find_package(CUDA QUIET) diff --git a/README.md b/README.md index 32a302cc5431a62b310d4812b545bd929f090e0a..68421cf177f4cd15f8f44e8d00a27cafb5a13b91 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,6 @@ # PaddlePaddle +English | [简体中文](./README_cn.md) [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle) [![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) @@ -7,7 +8,6 @@ [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) - Welcome to the PaddlePaddle GitHub. PaddlePaddle (PArallel Distributed Deep LEarning) is an easy-to-use, @@ -18,16 +18,6 @@ learning to many products at Baidu. Our vision is to enable deep learning for everyone via PaddlePaddle. Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle. - -欢迎来到 PaddlePaddle GitHub - -PaddlePaddle (PArallel Distributed Deep LEarning) 是一个简单易用、高效灵活、可扩展的深度学习平台,最初由百度科学家和工程师共同开发,目的是将深度学习技术应用到百度的众多产品中。 - -我们的愿景是让每个人都能通过PaddlePaddle接触深度学习 - -跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases) - - ### Latest PaddlePaddle Release: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2) ### Install Latest Stable Release: ``` @@ -43,23 +33,6 @@ pip install paddlepaddle-gpu==1.2.0.post85 # For installation on other platform, refer to http://paddlepaddle.org/ ``` - -### PaddlePaddle最新版本: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2) -### 安装最新稳定版本: -``` -# Linux CPU -pip install paddlepaddle -# Linux GPU cuda9cudnn7 -pip install paddlepaddle-gpu -# Linux GPU cuda8cudnn7 -pip install paddlepaddle-gpu==1.2.0.post87 -# Linux GPU cuda8cudnn5 -pip install paddlepaddle-gpu==1.2.0.post85 - -# 其他平台上的安装指引请参考 http://paddlepaddle.org/ -``` - - ## Features - **Flexibility** @@ -100,38 +73,10 @@ pip install paddlepaddle-gpu==1.2.0.post85 Baidu and it has achieved a significant impact. We hope you can also explore the capability of PaddlePaddle to make an impact on your product. 
-## 特点 - -- **灵活性** - - PaddlePaddle支持丰富的神经网络架构和优化算法。易于配置复杂模型,例如带有注意力机制或复杂记忆连接的神经网络机器翻译模型。 - -- **高效性** - - 为了高效使用异步计算资源,PaddlePaddle对框架的不同层进行优化,包括计算、存储、架构和通信。下面是一些样例: - - - 通过SSE/AVX 内置函数、BLAS库(例如MKL、OpenBLAS、cuBLAS)或定制的CPU/GPU内核优化数学操作。 - - 通过MKL-DNN库优化CNN网络 - - 高度优化循环网络,无需执行 `padding` 操作即可处理 **变长** 序列 - - 针对高维稀疏数据模型,优化了局部和分布式训练。 - - -- **稳定性** - - 有了 PaddlePaddle,使得利用各种CPU/GPU和机器来加速训练变得简单。PaddlePaddle 通过优化通信可以实现巨大吞吐量和快速执行。 - -- **连接产品** - - 另外,PaddlePaddle 的设计也易于部署。在百度,PaddlePaddle 已经部署到含有巨大用户量的产品和服务上,包括广告点击率(CTR)预测、大规模图像分类、光学字符识别(OCR)、搜索排序,计算机病毒检测、推荐系统等等。PaddlePaddle广泛应用于百度产品中,产生了非常重要的影响。我们希望您也能探索 PaddlePaddle 的能力,为您的产品创造新的影响力和效果。 - ## Installation It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) on our website. -## 安装 - -推荐阅读官网上的[安装说明](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) - ## Documentation We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) and @@ -153,37 +98,9 @@ We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarte We appreciate your contributions! -## 文档 - -我们提供[英文](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html)和 -[中文](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) 文档 - -- [深度学习101](https://github.com/PaddlePaddle/book) - - 或许您想从这个在线交互式书籍开始,可以在Jupyter Notebook中运行 - -- [分布式训练](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html) - - 可以在MPI集群上运行分布式训练任务 - -- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html) - - 新的API支持代码更少更简洁的程序 - -- [贡献方式](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html) - - 欢迎您的贡献! - ## Ask Questions You are welcome to submit questions and bug reports as [Github Issues](https://github.com/PaddlePaddle/Paddle/issues). -## 答疑 - -欢迎您将问题和bug报告以[Github Issues](https://github.com/PaddlePaddle/Paddle/issues)的形式提交 - ## Copyright and License PaddlePaddle is provided under the [Apache-2.0 license](LICENSE). 
- -## 版权和许可证 -PaddlePaddle由[Apache-2.0 license](LICENSE)提供 diff --git a/README_cn.md b/README_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..dfb55b17ca4fd05ce5b7b85b2e26e4f7f7229763 --- /dev/null +++ b/README_cn.md @@ -0,0 +1,88 @@ +# PaddlePaddle + +[English](./README.md) | 简体中文 + +[![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle) +[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) +[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) +[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases) +[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) + +欢迎来到 PaddlePaddle GitHub + +PaddlePaddle (PArallel Distributed Deep LEarning) 是一个简单易用、高效灵活、可扩展的深度学习平台,最初由百度科学家和工程师共同开发,目的是将深度学习技术应用到百度的众多产品中。 + +我们的愿景是让每个人都能通过PaddlePaddle接触深度学习 + +跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases) + +### PaddlePaddle最新版本: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2) +### 安装最新稳定版本: +``` +# Linux CPU +pip install paddlepaddle +# Linux GPU cuda9cudnn7 +pip install paddlepaddle-gpu +# Linux GPU cuda8cudnn7 +pip install paddlepaddle-gpu==1.2.0.post87 +# Linux GPU cuda8cudnn5 +pip install paddlepaddle-gpu==1.2.0.post85 + +# 其他平台上的安装指引请参考 http://paddlepaddle.org/ +``` + +## 特性 + +- **灵活性** + + PaddlePaddle支持丰富的神经网络架构和优化算法。易于配置复杂模型,例如带有注意力机制或复杂记忆连接的神经网络机器翻译模型。 + +- **高效性** + + 为了高效使用异步计算资源,PaddlePaddle对框架的不同层进行优化,包括计算、存储、架构和通信。下面是一些样例: + + - 通过SSE/AVX 内置函数、BLAS库(例如MKL、OpenBLAS、cuBLAS)或定制的CPU/GPU内核优化数学操作。 + - 通过MKL-DNN库优化CNN网络 + - 高度优化循环网络,无需执行 `padding` 操作即可处理 **变长** 序列 + - 针对高维稀疏数据模型,优化了局部和分布式训练。 + + +- **稳定性** + + 有了 PaddlePaddle,使得利用各种CPU/GPU和机器来加速训练变得简单。PaddlePaddle 通过优化通信可以实现巨大吞吐量和快速执行。 + +- **与产品相连** + + 另外,PaddlePaddle 的设计也易于部署。在百度,PaddlePaddle 已经部署到含有巨大用户量的产品和服务上,包括广告点击率(CTR)预测、大规模图像分类、光学字符识别(OCR)、搜索排序,计算机病毒检测、推荐系统等等。PaddlePaddle广泛应用于百度产品中,产生了非常重要的影响。我们希望您也能探索 PaddlePaddle 的能力,为您的产品创造新的影响力和效果。 + +## 安装 + +推荐阅读官网上的[安装说明](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) + +## 文档 + +我们提供[英文](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html)和 +[中文](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) 文档 + +- [深度学习101](https://github.com/PaddlePaddle/book) + + 或许您想从这个在线交互式书籍开始,可以在Jupyter Notebook中运行 + +- [分布式训练](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html) + + 可以在MPI集群上运行分布式训练任务 + +- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html) + + 新的API支持代码更少更简洁的程序 + +- [贡献方式](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html) + + 欢迎您的贡献! 
+ +## 答疑 + +欢迎您将问题和bug报告以[Github Issues](https://github.com/PaddlePaddle/Paddle/issues)的形式提交 + +## 版权和许可证 +PaddlePaddle由[Apache-2.0 license](LICENSE)提供 diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 076e839120d98d801de4374f2f8338ebd918b88f..b0f54bf49aafb65f1a92fa95877de2cc61fc67d3 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -152,7 +152,12 @@ endif() if (WITH_MKLML AND MKLML_IOMP_LIB) message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}") - set(OPENMP_FLAGS "-fopenmp") + if(WIN32) + # openmp not support well for now on windows + set(OPENMP_FLAGS "") + else(WIN32) + set(OPENMP_FLAGS "-fopenmp") + endif(WIN32) set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}") diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index ea46f6418edf1db70b2a308dd49cf2131cc89d3b..ef4192ecc98ea6de0c81c1f33320528d547b818a 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -203,25 +203,26 @@ list(APPEND CUDA_NVCC_FLAGS "-w") list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr") if (NOT WIN32) -if(CMAKE_BUILD_TYPE STREQUAL "Debug") - list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) -elseif(CMAKE_BUILD_TYPE STREQUAL "Release") - list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) -elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") - list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) -elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") - # nvcc 9 does not support -Os. Use Release flags instead - list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) -endif() + if(CMAKE_BUILD_TYPE STREQUAL "Debug") + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) + elseif(CMAKE_BUILD_TYPE STREQUAL "Release") + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) + elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) + elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") + # nvcc 9 does not support -Os. Use Release flags instead + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) + endif() else(NOT WIN32) -list(APPEND CUDA_NVCC_FLAGS "--compiler-options;/bigobj") -if(CMAKE_BUILD_TYPE STREQUAL "Debug") - list(APPEND CUDA_NVCC_FLAGS "-g -G") - # match the cl's _ITERATOR_DEBUG_LEVEL - list(APPEND CUDA_NVCC_FLAGS "-D_DEBUG") -elseif(CMAKE_BUILD_TYPE STREQUAL "Release") - list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG") -else() + list(APPEND CUDA_NVCC_FLAGS "-Xcompiler \"/wd 4244 /wd 4267 /wd 4819\"") + list(APPEND CUDA_NVCC_FLAGS "--compiler-options;/bigobj") + if(CMAKE_BUILD_TYPE STREQUAL "Debug") + list(APPEND CUDA_NVCC_FLAGS "-g -G") + # match the cl's _ITERATOR_DEBUG_LEVEL + list(APPEND CUDA_NVCC_FLAGS "-D_DEBUG") + elseif(CMAKE_BUILD_TYPE STREQUAL "Release") + list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG") + else() message(FATAL "Windows only support Release or Debug build now. Please set visual studio build type to Release/Debug, x64 build.") endif() endif(NOT WIN32) diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index 7a6a4523886824a67c82f9ce978de025ddb9c2cd..d3a4d69d3a05515fdf72074083470e19b4ec255c 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -20,8 +20,10 @@ SET(GLOG_INCLUDE_DIR "${GLOG_INSTALL_DIR}/include" CACHE PATH "glog include dire IF(WIN32) SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.lib" CACHE FILEPATH "glog library." 
FORCE) + SET(GLOG_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4530") ELSE(WIN32) SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." FORCE) + SET(GLOG_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) ENDIF(WIN32) INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR}) @@ -39,7 +41,7 @@ ExternalProject_Add( UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS=${GLOG_CMAKE_CXX_FLAGS} -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 6a7be73f09a278ab0fd29c7599a7781df3d29413..92fe76d05c7507c295b784bc37870abfc31a0a29 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -49,6 +49,8 @@ IF(NOT WIN32) SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value") SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}") SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}") +ELSE() + SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} /EHsc") ENDIF(NOT WIN32) ExternalProject_Add( @@ -61,7 +63,6 @@ ExternalProject_Add( UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} CMAKE_ARGS -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} CMAKE_ARGS -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} diff --git a/cmake/external/snappy.cmake b/cmake/external/snappy.cmake index 27d075336d556528ffaf1929c34753494692f0a0..1e01057aa606af78cd722d3619a710cb35817174 100644 --- a/cmake/external/snappy.cmake +++ b/cmake/external/snappy.cmake @@ -20,6 +20,12 @@ set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy) set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy) set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE) +if(WIN32) + SET(SNAPPY_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4267") +else() + SET(SNAPPY_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) +endif() + ExternalProject_Add( extern_snappy GIT_REPOSITORY "https://github.com/google/snappy" @@ -31,7 +37,7 @@ ExternalProject_Add( -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS=${SNAPPY_CMAKE_CXX_FLAGS} -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR} diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 9e6c47f016fe6dfd809c5b2bc88ff59d0a6b2b84..81e7868a6ad3fee16911a49ff9d1394a103706c5 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -147,12 +147,6 @@ set(GPU_COMMON_FLAGS -Wno-error=unused-function # Warnings in Numpy Header. -Wno-error=array-bounds # Warnings in Eigen::array ) - -else(NOT WIN32) -set(COMMON_FLAGS - "/w") #disable all warnings. 
-set(GPU_COMMON_FLAGS
-    "/w") #disable all warnings
 endif(NOT WIN32)
 
 if (APPLE)
@@ -193,8 +187,7 @@ safe_set_static_flag()
         CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
         CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
         CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
-    if(${flag_var} MATCHES "/W3")
-      string(REGEX REPLACE "/W3" "/w" ${flag_var} "${${flag_var}}")
-    endif(${flag_var} MATCHES "/W3")
+    string(REGEX REPLACE "(^| )/W[0-9]( |$)" " " ${flag_var} "${${flag_var}}")
+    set(${flag_var} "${${flag_var}} /w")
   endforeach(flag_var)
 endif(WIN32)
diff --git a/cmake/version.cmake b/cmake/version.cmake
index ac10bdf067be549fe90112aef73fd6e1fbe0ac48..dd57d4ab9969ce530f93ca1694350b1a26b5b543 100644
--- a/cmake/version.cmake
+++ b/cmake/version.cmake
@@ -31,8 +31,23 @@ while ("${PADDLE_VERSION}" STREQUAL "")
       set(tmp_version "${GIT_TAG_NAME}~1")
     endif()
   else()
-    # otherwise, we always set PADDLE_VERSION to 0.0.0 to represent latest
-    set(PADDLE_VERSION "0.0.0")
+    execute_process(
+      COMMAND ${GIT_EXECUTABLE} describe --exact-match --tags ${tmp_version}
+      WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
+      OUTPUT_VARIABLE GIT_EXACT_TAG_NAME
+      RESULT_VARIABLE GIT_EXACT_TAG_RESULT
+      ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+    if (NOT ${GIT_EXACT_TAG_RESULT})
+      # git describe succeeded: check whether the exact tag is a version tag
+      if (${GIT_EXACT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}")
+        string(REPLACE "v" "" PADDLE_VERSION ${GIT_EXACT_TAG_NAME})
+      else()
+        set(PADDLE_VERSION "0.0.0")
+      endif()
+    else()
+      # otherwise, we always set PADDLE_VERSION to 0.0.0 to represent latest
+      set(PADDLE_VERSION "0.0.0")
+    endif()
   endif()
 else()
   set(PADDLE_VERSION "0.0.0")
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 84b4677777a79b30ba8936025a60e8d6d9186a2c..f50a38842a21c795c979f859e88a9b16c3e54bd8 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -325,6 +325,7 @@ paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None
 paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0))
 paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.box_clip ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.multiclass_nms ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None))
 paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
 paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1))
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 66f11dedbaccd7febcd75fa7ade9c68b6c42022c..910318a49cea50fadd29b1427a4591abfa5d5a23 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -128,7 +128,7 @@ cc_test(version_test SRCS version_test.cc DEPS version)
 cc_library(proto_desc
SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version) -cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) +cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc memory_optimize_helper) nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto) @@ -192,6 +192,7 @@ cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry proto_desc) +cc_test(inplace_op_inference_test SRCS inplace_op_inference_test.cc DEPS op_registry proto_desc op_info memory_optimize_helper) cc_library(selected_rows SRCS selected_rows.cc DEPS tensor) cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index d5966ad5a97a97ec40c8a01d2d2c8ed5d7f90421..6621a59d37a670f7025507faeab5b9897794a72e 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -50,7 +50,9 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_ cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope) -cc_library(memory_optimize_pass SRCS analysis_var_pass.cc memory_reuse_types.cc DEPS graph graph_helper pass) +cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper) +cc_library(memory_optimize_pass SRCS memory_optimize_pass.cc DEPS memory_optimize_helper pass) +cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_info) cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper) cc_library(memory_early_delete_pass SRCS memory_early_delete_pass.cc DEPS memory_optimize_pass computation_op_handle scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass) @@ -65,12 +67,12 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle) -set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass memory_early_delete_pass) +set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass memory_early_delete_pass inplace_op_pass) if (WITH_GPU) list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass) endif() -cc_test(memory_reuse_types_test SRCS memory_reuse_types_test.cc memory_reuse_types.cc DEPS framework_proto graph) -cc_test(analysis_var_pass_test SRCS analysis_var_pass_test.cc analysis_var_pass.cc memory_reuse_types.cc DEPS framework_proto graph 
graph_helper op_registry pass) +cc_test(memory_optimize_helper_test SRCS memory_optimize_helper_test.cc memory_optimize_helper.cc DEPS framework_proto graph) +cc_test(memory_optimize_pass_test SRCS memory_optimize_pass_test.cc memory_optimize_pass.cc memory_optimize_helper.cc DEPS framework_proto graph graph_helper op_registry pass) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS}) diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index ce5731a1f414e8ef6d8af22a3bb17109e82beb87..51ce9732722efa44d2489f5b77694094e58c8775 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -17,7 +17,7 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/details/memory_reuse_types.h" +#include "paddle/fluid/framework/details/memory_optimize_helper.h" #include "paddle/fluid/framework/details/multi_devices_graph_pass.h" #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h" #include "paddle/fluid/framework/details/reduce_op_handle.h" @@ -47,6 +47,22 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { AppendPass("sequential_execution_pass"); } + // Add op fusion. + if (strategy.fuse_relu_depthwise_conv_) { + AppendPass("fuse_relu_depthwise_conv_pass"); + } + + // NOTE(dzhwinter): A note for automatical inplace. + // 1. modify program desc passes should put + // before inplace pass. + // 2. manually configured inplace should put + // before inplace_pass + + // Add automatically inplace. + if (strategy_.enable_inplace_) { + AppendPass("inplace_pass"); + } + // Add a graph viz pass to record a graph. if (!strategy_.debug_graphviz_path_.empty()) { auto viz_pass = AppendPass("graph_viz_pass"); @@ -55,10 +71,6 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { viz_pass->Set("graph_viz_path", new std::string(graph_path)); } - // Add op fusion. - if (strategy.fuse_relu_depthwise_conv_) { - AppendPass("fuse_relu_depthwise_conv_pass"); - } if (strategy.fuse_elewise_add_act_ops_) { auto fuse_elewise_add_act_pass = AppendPass("fuse_elewise_add_act_pass"); // Add a graph viz pass to record a graph. @@ -88,7 +100,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // A side-effect of that, memory optimize cannot forsee the fetched vars // , so fetchlist should be set persistable before call the Run interface. 
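+    // An illustrative configuration, using only fields that this patch
+    // defines on BuildStrategy (the graphviz path value is a placeholder):
+    //   BuildStrategy strategy;
+    //   strategy.memory_optimize_ = true;  // cross-op reuse of unlived vars
+    //   strategy.enable_inplace_ = true;   // share op input/output buffers
+    //   strategy.debug_graphviz_path_ = "/tmp/graph";  // optional viz dump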
if (strategy.memory_optimize_) { - auto analysis_var_pass = AppendPass("analysis_var_pass"); + auto memory_optimize_pass = AppendPass("memory_optimize_pass"); } AppendMultiDevPass(strategy); @@ -186,8 +198,10 @@ std::unique_ptr BuildStrategy::Apply( pass->Erase("nccl_ctxs"); pass->SetNotOwned("nccl_ctxs", nctx); #endif - - } else if (pass->Type() == "analysis_var_pass") { + } else if (pass->Type() == "memory_optimize_pass") { + if (graph->Has(kAllOpDescs)) { + graph->Erase(kAllOpDescs); + } const std::vector *all_op_descs = new std::vector(main_program.Block(0).AllOps()); graph->Set>(kAllOpDescs, @@ -214,6 +228,13 @@ std::unique_ptr BuildStrategy::Apply( pass->Set>( kAllOpDescs, new std::vector(main_program.Block(0).AllOps())); + } else if (pass->Type() == "inplace_pass") { + if (graph->Has(kAllOpDescs)) { + graph->Erase(kAllOpDescs); + } + graph->Set>( + kAllOpDescs, + new std::vector(main_program.Block(0).AllOps())); } else if (pass->Type() == "fuse_relu_depthwise_conv_pass") { if (!use_cuda) { LOG(WARNING) << "fuse_relu_depthwise_conv_pass is only supported on " @@ -239,9 +260,10 @@ USE_PASS(allreduce_mode_multi_devices_pass); USE_PASS(dist_multi_devices_pass); USE_PASS(multi_devices_check_pass); USE_PASS(multi_devices_print_pass); -USE_PASS(analysis_var_pass); +USE_PASS(memory_optimize_pass); USE_PASS(sequential_execution_pass); USE_PASS(all_reduce_deps_pass); USE_PASS(modify_op_lock_and_record_event_pass); +USE_PASS(inplace_pass); USE_PASS(lock_free_optimize_pass); USE_PASS(graph_to_program_pass); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index cd24a3175953bf323748bf0c7e3159761c13f0a9..e3e06a5614ddee0bea342bc3608691b7a32326cc 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -80,6 +80,11 @@ struct BuildStrategy { bool memory_early_delete_{false}; + // TODO(dzhwinter): + // make enable_inplace, memory_optimize_ + // memory_early_delete_ true by default + bool enable_inplace_{false}; + bool enable_sequential_execution_{false}; bool fuse_broadcast_op_{false}; diff --git a/paddle/fluid/framework/details/graph_test_base.h b/paddle/fluid/framework/details/graph_test_base.h new file mode 100644 index 0000000000000000000000000000000000000000..126959bcd80a4677f76b7cff677a82a319f7cfb3 --- /dev/null +++ b/paddle/fluid/framework/details/graph_test_base.h @@ -0,0 +1,80 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+
+#include <iostream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/program_desc.h"
+
+namespace paddle {
+namespace framework {
+
+// A no-op operator used to build test graphs.
+class DummyOp : public OperatorBase {
+ public:
+  DummyOp(const std::string& type, const VariableNameMap& inputs,
+          const VariableNameMap& outputs, const AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+ private:
+  void RunImpl(const Scope& scope,
+               const platform::Place& place) const override {}
+};
+
+class SumOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "").AsDuplicable();
+    AddOutput("Out", "");
+    AddComment("");
+  }
+};
+
+class AssignOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "").AsDuplicable();
+    AddOutput("Out", "");
+    AddComment("");
+  }
+};
+
+class SplitOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "");
+    AddOutput("Out", "").AsDuplicable();
+    AddComment("");
+  }
+};
+
+// Propagates the type of the first input var to the output var.
+class DummyVarTypeInference : public VarTypeInference {
+ public:
+  void operator()(const OpDesc& op_desc, BlockDesc* block) const override {
+    auto& inputs = op_desc.Input("X");
+    auto type = block->Var(inputs.front())->GetType();
+    auto out_var_name = op_desc.Output("Out").front();
+    block->Var(out_var_name)->SetType(type);
+  }
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/inplace_op_pass.cc b/paddle/fluid/framework/details/inplace_op_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..78c5d5b50e606daa963e728355dc1bce83cd5484
--- /dev/null
+++ b/paddle/fluid/framework/details/inplace_op_pass.cc
@@ -0,0 +1,433 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/inplace_op_pass.h"
+#include <algorithm>
+#include <deque>
+#include <iterator>
+#include <stack>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+#include "paddle/fluid/framework/details/memory_optimize_pass.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/op_info.h"
+
+// NOTE(dzhwinter): inplace means an op's output variable reuses the space of
+// its input. By our design, an operator may only read its input (a const
+// Variable) and write its output (a non-const Variable). If an operator is
+// inplaced, a write to that shared space can happen before the read does,
+// especially when certain optimized kernel-writing styles are applied.
+//
+// /* wrong case in operator */
+// /* Here a larger allocation is made and the input content is lost */
+// const Tensor* in = ctx.Input<Tensor>("In");
+// Tensor* out = ctx.Output<Tensor>("Out");
+// auto* out_ptr = out->mutable_data<T>(ctx.GetPlace());
+// out_ptr[0] = 0;  // input content is overwritten.
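+//
+// /* correct case in operator -- an illustrative sketch added for contrast:
+//    finish all reads of the input before any write to the output, since
+//    with inplace both names refer to the same buffer */
+// const Tensor* in = ctx.Input<Tensor>("In");
+// Tensor* out = ctx.Output<Tensor>("Out");
+// T first = in->data<T>()[0];  // read the input first ...
+// auto* out_ptr = out->mutable_data<T>(ctx.GetPlace());
+// out_ptr[0] = first;          // ... then write the output.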
+
+// NOTE(dzhwinter):
+// Only for backward compatibility and stability. If enable_inplace_whitelist
+// is turned on, only the ops in the whitelist will use the inplace strategy.
+// If not, every op that registered an inplace inferer will be inplaced.
+DEFINE_bool(
+    enable_inplace_whitelist, false,
+    "If this option is turned on, only the ops in the whitelist can be "
+    "inplaced. If it is turned off, all of the running ops are candidates "
+    "for inplacing, such as scale and elementwise_add. "
+    "By default, it is turned off.");
+
+DECLARE_string(memory_optimize_debug);
+
+// clang-format off
+const std::string kInplacedOpWhiteList[] = { // NOLINT
+    "sigmoid",
+    "exp",
+    "relu",
+    "tanh",
+    "sqrt",
+    "ceil",
+    "floor",
+    "reciprocal",
+    "relu6",
+    "soft_relu",
+    "hard_sigmoid",
+    "batch_norm",
+    "batch_norm_grad",
+    "sum",
+    "sum_grad",
+    "scale",
+    "reshape",
+    "elementwise_add",
+    "elementwise_add_grad",
+};
+// clang-format on
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+static inline ir::Node* GetNextCascadeInplacedVar(ir::Node* var) {
+  // if the next op is inplaced, return the (reused) output var;
+  // otherwise return nullptr.
+  PADDLE_ENFORCE(var && var->IsVar() && !var->IsCtrlVar());
+  ir::Node* inplaced_var = nullptr;
+  for (auto* next_op : var->outputs) {
+    for (auto* output : next_op->outputs) {
+      if (output->IsVar() && !output->IsCtrlVar() &&
+          output->Name() == var->Name()) {
+        inplaced_var = output;
+      }
+    }
+  }
+  return inplaced_var;
+}
+
+static inline ir::Node* GetPrevCascadeInplacedVar(ir::Node* var) {
+  PADDLE_ENFORCE(var && var->IsVar() && !var->IsCtrlVar());
+  if (var->inputs.empty()) return nullptr;
+  auto* prev_op = var->inputs.at(0);
+  auto input_it = std::find_if(prev_op->inputs.begin(), prev_op->inputs.end(),
+                               [&](ir::Node* node) {
+                                 return node->IsVar() && !node->IsCtrlVar() &&
+                                        node->Name() == var->Name();
+                               });
+  return input_it == prev_op->inputs.end() ?
nullptr : *input_it; +} + +InplacePass::InplacePass() : Pass() { + if (FLAGS_enable_inplace_whitelist) { + for (auto& s : kInplacedOpWhiteList) { + whitelist_.emplace(s); + } + } +} + +void InplacePass::InitSSAGraphNodes() const { + std::unordered_map> all_vars; + for (auto* op : view_.AllOps()) { + for (auto* node : op->inputs) { + if (!node->IsVar() || node->IsCtrlVar()) continue; + if (all_vars[node->Name()].count(node) == 0) { + all_vars[node->Name()].emplace(node); + var_nodes_[node->Name()].emplace_back(node); + } + } + for (auto* node : op->outputs) { + if (!node->IsVar() || node->IsCtrlVar()) continue; + if (all_vars[node->Name()].count(node) == 0) { + all_vars[node->Name()].emplace(node); + var_nodes_[node->Name()].emplace_back(node); + } + } + } +} + +std::unique_ptr InplacePass::ApplyImpl( + std::unique_ptr graph) const { + var_nodes_.clear(); + view_.Build(graph.get()); + InitSSAGraphNodes(); + + for (auto* op : view_.AllOps()) { + if (FLAGS_enable_inplace_whitelist && !whitelist_.count(op->Name())) + continue; + TryInplaceOpInputOutput(op, graph.get()); + } + graph->ResolveHazard(var_nodes_); + + return graph; +} + +void InplacePass::InplaceModifyDesc(const std::string& var, + const std::string& cache_var, + const size_t& idx) const { + for (size_t i = idx; i < view_.AllOps().size(); ++i) { + ir::Node* op = view_.AllOps()[i]; + PADDLE_ENFORCE(op->IsOp() && op->Op()); + auto* op_desc = op->Op(); + op_desc->RenameInput(var, cache_var); + op_desc->RenameOutput(var, cache_var); + if (op_desc->Block()->HasVar(var)) op_desc->Block()->RemoveVar(var); + op_desc->Flush(); + } +} + +const SSANodePair InplacePass::TryInplaceModifyVar(const std::string& var, + const std::string& cache_var, + const size_t& idx, + ir::Graph* graph) const { + PADDLE_ENFORCE(var_nodes_[var].size() >= 1 && + var_nodes_[var].at(0)->Var() != nullptr); + std::unique_ptr var_desc(new VarDesc(*var_nodes_[var].at(0)->Var())); + var_desc->SetName(cache_var); + + SSANodePair swap_nodes; + + for (size_t i = idx; i < view_.AllOps().size(); ++i) { + auto* op = view_.AllOps()[i]; + + // redirect the input to the latest version of cache_var + for (auto* node : op->inputs) { + if (node->Name() == var) { + ir::Node* cache_node = graph->CreateVarNode(var_desc.get()); + + // swap node to cache_node + cache_node->outputs.insert(cache_node->outputs.end(), + node->outputs.begin(), node->outputs.end()); + PADDLE_ENFORCE(node->inputs.size() == 1 && node->inputs[0]->IsOp()); + auto* prev_op = node->inputs[0]; + std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), node, + cache_node); + cache_node->inputs.emplace_back(prev_op); + for (auto* next_op : node->outputs) { + std::replace(next_op->inputs.begin(), next_op->inputs.end(), node, + cache_node); + } + + swap_nodes.emplace_back(std::make_pair(node, cache_node)); + } + } + + // if we need to rename the output, + // always create a newer version of cache_var + for (auto* node : op->outputs) { + if (node->Name() == var) { + ir::Node* cache_node = graph->CreateVarNode(var_desc.get()); + // swap node to cache node + cache_node->outputs.insert(cache_node->outputs.end(), + node->outputs.begin(), node->outputs.end()); + cache_node->inputs.emplace_back(op); + std::replace(op->outputs.begin(), op->outputs.end(), node, cache_node); + for (auto* next_op : node->outputs) { + std::replace(next_op->inputs.begin(), next_op->inputs.end(), node, + cache_node); + } + + swap_nodes.emplace_back(std::make_pair(node, cache_node)); + } + } + } + + return swap_nodes; +} + +void 
InplacePass::CommitModify(const SSANodePair& swap_nodes,
+                               ir::Graph* graph) const {
+  for (auto& pair : swap_nodes) {
+    auto *node = pair.first, *cache_node = pair.second;
+    const std::string var = node->Name(), cache_var = cache_node->Name();
+    var_nodes_[cache_var].emplace_back(cache_node);
+    graph->RemoveNode(node);
+    auto& nodes = var_nodes_.at(var);
+    // release the unused var in the graph. Because the Python-side memory
+    // optimize may reuse vars under the same name, we only clear the var
+    // nodes after the currently inplaced index.
+    nodes.erase(std::remove(nodes.begin(), nodes.end(), node), nodes.end());
+  }
+}
+
+void InplacePass::WithdrawModify(const SSANodePair& nodes,
+                                 ir::Graph* graph) const {
+  for (auto& pair : nodes) {
+    auto *node = pair.first, *cache_node = pair.second;
+    const std::string var = node->Name(), cache_var = cache_node->Name();
+    auto* prev_op = node->inputs[0];
+    std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), cache_node,
+                 node);
+    for (auto* next_op : node->outputs) {
+      std::replace(next_op->inputs.begin(), next_op->inputs.end(), cache_node,
+                   node);
+    }
+    graph->RemoveNode(cache_node);
+  }
+}
+
+void InplacePass::TryInplaceOpInputOutput(ir::Node* op,
+                                          ir::Graph* graph) const {
+  VLOG(4) << "Try to inplace op " << op->Name();
+  PADDLE_ENFORCE(op->Op() != nullptr && op->Op()->Block() != nullptr,
+                 "op_desc is nullptr");
+  // several prerequisites have to be met before an op can be inplaced.
+
+  auto* op_desc = op->Op();
+  auto& infer_inplace =
+      OpInfoMap::Instance().Get(op_desc->Type()).infer_inplace_;
+
+  // 1. an inplace inferer (infer_inplace_) has been registered.
+  if (!static_cast<bool>(infer_inplace)) return;
+  PADDLE_ENFORCE(static_cast<bool>(infer_inplace),
+                 "%s's infer_inplace has not been registered", op_desc->Type());
+
+  auto* block = op_desc->Block();
+  auto in_to_outs = infer_inplace(*op_desc, block);
+
+  auto& all_ops = view_.AllOps();
+  auto cursor = std::find(all_ops.begin(), all_ops.end(), op);
+  size_t idx = std::distance(all_ops.begin(), cursor);
+
+  for (auto& pair : in_to_outs) {
+    auto& in_var_name = pair.first;
+    auto& out_var_name = pair.second;
+    auto* in_node = view_.GetNodeByName(in_var_name, op->inputs);
+    auto* out_node = view_.GetNodeByName(out_var_name, op->outputs);
+
+    // 2. there must be no external pending op on the input node.
+    if (view_.PendingOpsOnVar(in_node).size() > 1) {
+      VLOG(4) << string::Sprintf(
+          "Skipped pair %s => %s. %s input has an external dependency; "
+          "inplacing such a pair would overwrite that memory.",
+          out_var_name, in_var_name, op->Name());
+      continue;
+    }
+
+    // 3. if the output was already reused by the Python-side memory optimize
+    // (fluid.memory_optimize()), this candidate can not be inplaced. This
+    // restriction will be deprecated in the future.
+    if (view_.InSkipSet(out_node->Name())) {
+      VLOG(4) << string::Sprintf(
+          "Skipped pair %s => %s in op %s: the output reused a previous "
+          "memory block from the Python memory optimize; inplacing it may "
+          "generate a cycle.",
+          out_var_name, in_var_name, op->Name());
+      continue;
+    }
+
+    // Debug interface: a var named by FLAGS_memory_optimize_debug is
+    // skipped by force.
+    if (out_node->Name() == FLAGS_memory_optimize_debug) {
+      VLOG(3) << "Skipped var by force. FLAGS_memory_optimize_debug="
+              << out_node->Name();
+      continue;
+    }
+
+    // NOTE(dzhwinter):
+    // two-stage commit of the inplace process: if the inplace would
+    // introduce a cycle, withdraw the changes; otherwise, safely commit
+    // the swapped nodes.
+    auto swap_nodes =
+        TryInplaceModifyVar(out_var_name, in_var_name, idx, graph);
+
+    if (!ir::HasCircle(*graph)) {
+      VLOG(3) << string::Sprintf("!!! %s, %s => %s inplaced", op->Name(),
+                                 out_var_name, in_var_name);
+      InplaceModifyDesc(out_var_name, in_var_name, idx);
+      CommitModify(swap_nodes, graph);
+    } else {
+      VLOG(3) << string::Sprintf(
+          "Skipped pair %s => %s: inplacing would generate a cycle; "
+          "withdraw %s",
+          out_var_name, in_var_name, op->Name());
+      WithdrawModify(swap_nodes, graph);
+    }
+  }
+}
+
+ir::Node* GraphView::GetNodeByName(const std::string& name,
+                                   const std::vector<ir::Node*>& nodes) const {
+  // nodes should be op->inputs or op->outputs;
+  // var nodes inside the same op must have unique names.
+  std::unordered_set<std::string> nodes_in_op;
+  bool has_dup_node =
+      std::any_of(nodes.begin(), nodes.end(), [&nodes_in_op](ir::Node* node) {
+        if (node->IsVar() && !node->IsCtrlVar() && node->Var() != nullptr) {
+          if (nodes_in_op.count(node->Name())) return true;
+          nodes_in_op.emplace(node->Name());
+        }
+        return false;
+      });
+  PADDLE_ENFORCE(has_dup_node == false, "nodes have duplicate names!");
+  ir::Node* node = nullptr;
+  for (auto* it : nodes) {
+    if (!it->IsVar() || it->IsCtrlVar() || it->Var() == nullptr) continue;
+    if (it->Name() == name) {
+      node = it;
+      break;
+    }
+  }
+  PADDLE_ENFORCE(node != nullptr,
+                 string::Sprintf("Not found var %s in nodes!", name));
+  return node;
+}
+
+std::vector<ir::Node*> GraphView::PendingOpsOnVar(ir::Node* node) {
+  // collect the pending ops that depend on the same var node.
+  // the node itself may be an inplaced variable, so backtrack through
+  // all of its previous inplaced vars as well.
+  std::vector<ir::Node*> pending_ops;
+  ir::Node* p = node;
+  while (p != nullptr) {
+    pending_ops.insert(pending_ops.end(), p->outputs.begin(),
+                       p->outputs.end());
+    p = GetPrevCascadeInplacedVar(p);
+  }
+  return pending_ops;
+}
+
+void GraphView::Build(ir::Graph* g) {
+  // track the var nodes in the correct order.
+  // Because new nodes are inserted, there may be data races between nodes;
+  // resolving those data hazards depends on visiting the var nodes in the
+  // right order.
+  ops_ = SortOpLikeDescOrder(*g);
+
+  // 1. track the nodes that reused a previous node in the Python memory
+  // optimize. these nodes can not be inplaced, otherwise a cycle may appear
+  // in the graph.
+  std::unordered_set<std::string> all_vars;
+  for (auto& node : g->Nodes()) {
+    if (node->IsVar()) continue;
+    for (auto& out : node->outputs) {
+      if (out->IsCtrlVar() || out->Var() == nullptr) continue;
+      if (all_vars.count(out->Name())) {
+        dup_nodes_.emplace(out->Name());
+      } else {
+        all_vars.emplace(out->Name());
+      }
+    }
+  }
+
+  // 2. track the nodes used by the parameter server.
+  // these nodes can not be inplaced, otherwise the trainer and the
+  // pserver can no longer find each other by name.
+  auto update_skip_set = [&](ir::Node* node) {
+    for (auto& in : node->inputs) {
+      if (in->IsVar() && in->Var() != nullptr) dup_nodes_.emplace(in->Name());
+    }
+    for (auto& out : node->outputs) {
+      if (out->IsVar() && out->Var() != nullptr)
+        dup_nodes_.emplace(out->Name());
+    }
+  };
+  for (auto& node : g->Nodes()) {
+    if (!node->IsOp()) continue;
+    if (node->Name() == "send") update_skip_set(node);
+    if (node->Name() == "recv") update_skip_set(node);
+    if (node->Name() == "prefetch") update_skip_set(node);
+  }
+}
+
+const std::vector<ir::Node*>& GraphView::AllOps() { return ops_; }
+
+bool GraphView::InSkipSet(const std::string& var) const {
+  return dup_nodes_.count(var);
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(inplace_pass, paddle::framework::details::InplacePass);
diff --git a/paddle/fluid/framework/details/inplace_op_pass.h b/paddle/fluid/framework/details/inplace_op_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..1abcf1f279e225839d440ff9c6840ce9b8a6547f
--- /dev/null
+++ b/paddle/fluid/framework/details/inplace_op_pass.h
@@ -0,0 +1,93 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <map>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+#include "paddle/fluid/framework/details/memory_optimize_helper.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+class GraphView {
+ public:
+  GraphView() = default;
+
+  void Build(ir::Graph* g);
+
+  const std::vector<ir::Node*>& AllOps();
+
+  ir::Node* GetNodeByName(const std::string& name,
+                          const std::vector<ir::Node*>& nodes) const;
+
+  std::vector<ir::Node*> PendingOpsOnVar(ir::Node* var);
+
+  // Will be deprecated in the future.
+  // NOTE(dzhwinter):
+  // 1. the Python memory optimize reuses memory based on var names, so
+  // different op outputs may share the same variable name; enabling inplace
+  // on such nodes would generate a cycle in the SSA graph.
+  // 2. DistributeTranspiler uses unique names to
+  // map the parameter and gradient, so those vars must be skipped.
+ bool InSkipSet(const std::string& var) const; + + private: + std::vector ops_; + std::unordered_set dup_nodes_; // mem opt affect nodes + std::map> adj_list_; +}; + +typedef std::vector> SSANodePair; +class InplacePass : public ir::Pass { + public: + InplacePass(); + + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; + + void InitSSAGraphNodes() const; + + private: + const SSANodePair TryInplaceModifyVar(const std::string& var, + const std::string& cache_var, + const size_t& idx, + ir::Graph* graph) const; + + void CommitModify(const SSANodePair&, ir::Graph* graph) const; + + void WithdrawModify(const SSANodePair& nodes, ir::Graph* graph) const; + + void InplaceModifyDesc(const std::string& in_var, const std::string& out_var, + const size_t& idx) const; + + void TryInplaceOpInputOutput(ir::Node* op, ir::Graph* graph) const; + + mutable std::map> var_nodes_; + + mutable std::unordered_set whitelist_; + mutable GraphView view_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/memory_early_delete_pass.cc b/paddle/fluid/framework/details/memory_early_delete_pass.cc index 5906b7d57ce122520a4594f1528e00982eaa1a7f..69f8f705484450b0544291b19027eb174d7eeb8f 100644 --- a/paddle/fluid/framework/details/memory_early_delete_pass.cc +++ b/paddle/fluid/framework/details/memory_early_delete_pass.cc @@ -16,7 +16,7 @@ #include #include #include -#include "paddle/fluid/framework/details/memory_reuse_types.h" +#include "paddle/fluid/framework/details/memory_optimize_helper.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/reference_count_pass_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h" diff --git a/paddle/fluid/framework/details/memory_reuse_types.cc b/paddle/fluid/framework/details/memory_optimize_helper.cc similarity index 66% rename from paddle/fluid/framework/details/memory_reuse_types.cc rename to paddle/fluid/framework/details/memory_optimize_helper.cc index 2b9ff518b9adcd366cc877998400a8bdc05fa033..b56ef021ef508a43aac082acbcfa6f543635203e 100644 --- a/paddle/fluid/framework/details/memory_reuse_types.cc +++ b/paddle/fluid/framework/details/memory_optimize_helper.cc @@ -12,8 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/details/memory_reuse_types.h" +#include "paddle/fluid/framework/details/memory_optimize_helper.h" +#include #include +#include #include #include @@ -21,15 +23,17 @@ namespace paddle { namespace framework { namespace details { +size_t NodeSizeInBytes(const VarDesc& node) { + auto shape = node.GetShape(); + int size = + std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); + size_t type_size = SizeOfType(node.GetDataType()); + return type_size * std::abs(size); +} + size_t NodeSizeInBytes(ir::Node* n) { auto* desc = FindVarDescInBlock(n); - auto shape = desc->GetShape(); - size_t type_size = SizeOfType(desc->GetDataType()); - int size = 1; - for (auto& s : shape) { - size *= s; - } - return type_size * std::abs(size); + return NodeSizeInBytes(*desc); } std::string DebugStringImpl(VarDesc* var) { @@ -83,7 +87,7 @@ struct NodeComparator { } }; -void OrderedNodePairPool::Insert(ir::Node* var, ir::Node* op) { +void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) { PADDLE_ENFORCE(var->IsVar() && !var->IsCtrlVar()); PADDLE_ENFORCE(op->IsOp()); if (mark_table_.count(var->Name()) != 0) { @@ -119,11 +123,11 @@ void OrderedNodePairPool::Insert(ir::Node* var, ir::Node* op) { mark_table_[var->Name()] = it; } -int OrderedNodePairPool::GetIndex(ir::Node* var) { +int OrderedNodeList::GetIndex(ir::Node* var) { return std::distance(nodes_.begin(), mark_table_[var->Name()]); } -ir::Node* OrderedNodePairPool::NodeMatch(ir::Node* var) const { +ir::Node* OrderedNodeList::NodeMatch(ir::Node* var) const { ir::Node* found_node = nullptr; NodeComparator compare_node; @@ -136,13 +140,15 @@ ir::Node* OrderedNodePairPool::NodeMatch(ir::Node* var) const { return found_node; } -void OrderedNodePairPool::Erase(ir::Node* var) { - PADDLE_ENFORCE(mark_table_.count(var->Name())); - nodes_.erase(mark_table_[var->Name()]); - mark_table_.erase(var->Name()); +void OrderedNodeList::Erase(ir::Node* var) { Erase(var->Name()); } + +void OrderedNodeList::Erase(const std::string& var) { + PADDLE_ENFORCE(mark_table_.count(var)); + nodes_.erase(mark_table_[var]); + mark_table_.erase(var); } -std::string OrderedNodePairPool::ToString() const { +std::string OrderedNodeList::ToString() const { std::stringstream ss; for (auto it = nodes_.begin(); it != nodes_.end(); ++it) { ss << DebugString(it->first) << " "; @@ -150,6 +156,43 @@ std::string OrderedNodePairPool::ToString() const { return ss.str(); } +bool NodeCanReused(ir::Node* node) { + if (node == nullptr || !node->IsVar() || node->IsCtrlVar()) return false; + // auto* desc = node->Var(); + bool flag = NodeCanReused(*node->Var()); + for (auto* op : node->inputs) { + if (op->Op()->HasAttr("force_cpu")) { + // op output force generated in cpu, can not be reused. + flag &= framework::AttrReader(op->Op()->GetAttrMap()) + .Get("force_cpu") == 0; + } + } + return flag; +} + +bool NodeCanReused(const VarDesc& node) { + auto type = node.GetType(); + if (node.Persistable() || type != proto::VarType::LOD_TENSOR || + node.GetShape().empty()) { + return false; + } + // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. 
For example, while_grad + std::string name = node.Name(); + if (!name.empty() && name[0] == '@' && name[name.size() - 1] == '@') + return false; + return true; +} + +bool OpHasSubBlock(OpDesc* desc) { + const AttributeMap& attrs = desc->GetAttrMap(); + for (auto& attr : attrs) { + if (attr.second.type() == typeid(BlockDesc*) || // NOLINT + attr.second.type() == typeid(std::vector)) // NOLINT + return true; + } + return false; +} + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/memory_reuse_types.h b/paddle/fluid/framework/details/memory_optimize_helper.h similarity index 69% rename from paddle/fluid/framework/details/memory_reuse_types.h rename to paddle/fluid/framework/details/memory_optimize_helper.h index 9a9c1d948e869016717fea9ff6b8236adfc29845..064183d61ea7386b6b45034c90fd7569a8647f60 100644 --- a/paddle/fluid/framework/details/memory_reuse_types.h +++ b/paddle/fluid/framework/details/memory_optimize_helper.h @@ -43,7 +43,7 @@ using GraphNodePool = std::vector< // For example, // node0[-1, 1] node1[-1, 1, 1], node2[1,1], node3[1,1024], .. // O(1) insert, delete -class OrderedNodePairPool { +class OrderedNodeList { public: using NodePair = std::pair>; using Iter = typename std::list::iterator; @@ -53,8 +53,12 @@ class OrderedNodePairPool { void Erase(ir::Node* var); + void Erase(const std::string& var); + bool Has(ir::Node* var) { return mark_table_.count(var->Name()); } + bool Has(const std::string& var) { return mark_table_.count(var); } + ir::Node* NodeMatch(ir::Node* var) const; // map store non-const iterator, can not promise const int GetIndex(ir::Node* var); @@ -67,6 +71,11 @@ class OrderedNodePairPool { ConstIter end() const { return nodes_.end(); } size_t size() const { return nodes_.size(); } + void Clear() { + mark_table_.clear(); + nodes_.clear(); + } + private: // for searching. std::unordered_map mark_table_; @@ -74,14 +83,53 @@ class OrderedNodePairPool { std::list nodes_; }; +// valid a tensor can be reuse or not +bool NodeCanReused(ir::Node* node); + +// valid a tensor can be reuse or not. 
+bool NodeCanReused(const VarDesc& node); + +// check op has subblock or not +bool OpHasSubBlock(OpDesc* desc); + // node memory size in bytes size_t NodeSizeInBytes(ir::Node* n); +// node memory size in bytes +size_t NodeSizeInBytes(const VarDesc&); + std::string DebugString(ir::Node* var); -// std::string DebugString(VarDesc* var); VarDesc* FindVarDescInBlock(ir::Node* n); +template +class FilterVariableImpl { + public: + void operator()(const Container& nodes, Callback callback) { + for (auto* node : nodes) { + callback(node); + } + } +}; + +// filter var node for op->inputs/outputs +template +class FilterVariableImpl, Callback> { + public: + void operator()(const std::vector& nodes, Callback callback) { + for (auto* var : nodes) { + if (var->IsVar() && !var->IsCtrlVar()) { + callback(var); + } + } + } +}; + +template +void FilterVariables(const Container& nodes, Callback callback) { + FilterVariableImpl()(nodes, callback); +} + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/memory_reuse_types_test.cc b/paddle/fluid/framework/details/memory_optimize_helper_test.cc similarity index 96% rename from paddle/fluid/framework/details/memory_reuse_types_test.cc rename to paddle/fluid/framework/details/memory_optimize_helper_test.cc index d2fabf5ce068e0f752b86c0d02b971f18fc65f01..f2b9baf14a34ace9cc860797280dbd519dfa4f2a 100644 --- a/paddle/fluid/framework/details/memory_reuse_types_test.cc +++ b/paddle/fluid/framework/details/memory_optimize_helper_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/details/memory_reuse_types.h" +#include "paddle/fluid/framework/details/memory_optimize_helper.h" #include #include #include @@ -27,8 +27,8 @@ namespace paddle { namespace framework { namespace details { -TEST(OrderedNodePairPool, Normal) { - OrderedNodePairPool pool; +TEST(OrderedNodeList, Normal) { + OrderedNodeList pool; std::vector> nodes; // clang-format off diff --git a/paddle/fluid/framework/details/analysis_var_pass.cc b/paddle/fluid/framework/details/memory_optimize_pass.cc similarity index 78% rename from paddle/fluid/framework/details/analysis_var_pass.cc rename to paddle/fluid/framework/details/memory_optimize_pass.cc index 223b9da3cfba33fc32d1334cddccb9f503bd0bef..85de14a60a8fe6958794f0ac25768b9da1943f9d 100644 --- a/paddle/fluid/framework/details/analysis_var_pass.cc +++ b/paddle/fluid/framework/details/memory_optimize_pass.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/details/analysis_var_pass.h" +#include "paddle/fluid/framework/details/memory_optimize_pass.h" #include #include #include @@ -48,39 +48,10 @@ static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) { op1->Outputs() == op2->Outputs(); } -template -class FilterVariableImpl { - public: - void operator()(const Container& nodes, Callback callback) { - for (auto* node : nodes) { - callback(node); - } - } -}; - -// filter var node for op->inputs/outputs -template -class FilterVariableImpl, Callback> { - public: - void operator()(const std::vector& nodes, Callback callback) { - for (auto* var : nodes) { - if (var->IsVar() && !var->IsCtrlVar()) { - callback(var); - } - } - } -}; - -template -void FilterVariables(const Container& nodes, Callback callback) { - FilterVariableImpl()(nodes, callback); -} - -std::unique_ptr AnalysisVarPass::ApplyImpl( +std::unique_ptr MemoryOptimizePass::ApplyImpl( std::unique_ptr graph) const { auto nodes = graph->Nodes(); - auto subblock_vars = GetSubBlockVars(nodes); - skip_set_.insert(subblock_vars.begin(), subblock_vars.end()); + CollectSkipVarsSet(nodes); cfg_.reset(new details::ControlFlowGraph(*graph)); cfg_->LiveVariableAnalysis(); @@ -103,48 +74,53 @@ std::unique_ptr AnalysisVarPass::ApplyImpl( } for (auto& var : op->outputs) { - if (NodeCanReused(var) && cfg_->Use(op).count(var->Name()) == 0) { - ir::Node* cache = pool_.NodeMatch(var); - if (var->Name() == FLAGS_memory_optimize_debug) { - VLOG(3) << "start match var " << DebugString(var) << " of op " - << op->Name(); - VLOG(3) << pool_.ToString(); - VLOG(3) << "matched in pool : " - << ((cache == nullptr) ? "False" : "True"); - } - if (cache != nullptr) { - if (var->Name() == cache->Name()) { - VLOG(3) << "The same cache variable is cascade reused." - << var->Name() << " is re-filled to the pool after" - << "the reused op is finished. Current op can not " - << "replace it again. Skip this candidate."; - continue; - } + if (!NodeCanReused(var) || cfg_->Use(op).count(var->Name()) == 0 || + skip_set_.count(var->Name())) + continue; + ir::Node* cache = pool_.NodeMatch(var); + + if (var->Name() == FLAGS_memory_optimize_debug) { + VLOG(3) << "start match var " << DebugString(var) << " of op " + << op->Name(); + VLOG(3) << pool_.ToString(); + VLOG(3) << "matched in pool : " + << ((cache == nullptr) ? "False" : "True"); + } - int node_idx_in_pool = pool_.GetIndex(cache); - VLOG(3) << string::Sprintf( - "!!! %s, %s => %s, cache idx %d, pool size %d", - std::to_string(reuse_id++), DebugString(var), DebugString(cache), - node_idx_in_pool, static_cast(pool_.size())); - // update CFG Graph on the fly. - // reused var maybe re-fill into the pool - cfg_->RenameVarInCFGGraph(var->Name(), cache->Name(), idx); - // NOTE(dzhwinter): we need to both update the ProgramDesc - // and IR Graph. because op_desc/var_desc is used in CreateOp, - // CreateVar when running happens. But IR Graph - // define the dependence relationship between nodes. - RenameVarInGraphDesc(var->Name(), cache->Name(), idx); - RenameVarInGraphNode(var->Name(), cache->Name(), idx, graph.get()); - - pool_.Erase(cache); + if (cache == nullptr) continue; + if (var->Name() == cache->Name()) { + VLOG(3) << "The same cache variable is cascade reused. " << var->Name() + << " is re-filled to the pool after " + << "the reused op is finished. Current op can not " + << "replace it again. Skip this candidate."; + continue; + } + + int node_idx_in_pool = pool_.GetIndex(cache); + VLOG(3) << string::Sprintf( + "!!! %s, %s => %s, cache idx %d, pool size %d",
+ std::to_string(reuse_id++), DebugString(var), DebugString(cache), + node_idx_in_pool, static_cast(pool_.size())); + // update the CFG graph on the fly; + // the reused var may be re-filled into the pool + cfg_->RenameVarInCFGGraph(var->Name(), cache->Name(), idx); + // NOTE(dzhwinter): we need to update both the ProgramDesc + // and the IR Graph, because op_desc/var_desc are used by CreateOp + // and CreateVar at runtime, while the IR Graph defines + // the dependence relationships between nodes. + RenameVarInGraphDesc(var->Name(), cache->Name(), idx); + RenameVarInGraphNode(var->Name(), cache->Name(), idx, graph.get()); + + pool_.Erase(cache); + // fill the pool + std::unordered_set unlived_vars; + for (auto var : cfg_->LiveIn(op)) { + if (cfg_->LiveOut(op).count(var) == 0) { + unlived_vars.emplace(var); } } - } - // fill the pool - for (auto var : cfg_->LiveIn(op)) { - if (cfg_->LiveOut(op).count(var) == 0) { + for (auto var : unlived_vars) { ir::Node* var_node = cfg_->GetNodeFromVarName(var, op); - if (var_node == nullptr) continue; if (NodeCanReused(var_node) && !pool_.Has(var_node)) { pool_.Insert(var_node, op); } @@ -177,7 +153,7 @@ std::unique_ptr AnalysisVarPass::ApplyImpl( return graph; } -void AnalysisVarPass::SubGraphOptimize(OpDesc* op_desc) const { +void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const { // conditional block, while op and their grad op auto* sub_block_desc = AttrReader(op_desc->GetAttrMap()).Get("sub_block"); @@ -247,25 +223,32 @@ void AnalysisVarPass::SubGraphOptimize(OpDesc* op_desc) const { } } -std::unordered_set AnalysisVarPass::GetSubBlockVars( +void MemoryOptimizePass::CollectSkipVarsSet( const std::unordered_set& nodes) const { - std::unordered_set vars; + auto update_skip_set = [&](OpDesc* op_desc) { + auto inputs = op_desc->InputArgumentNames(); + auto outputs = op_desc->OutputArgumentNames(); + skip_set_.insert(inputs.begin(), inputs.end()); + skip_set_.insert(outputs.begin(), outputs.end()); + }; for (auto& op : nodes) { if (!op->IsOp() || op->Op() == nullptr) continue; auto* op_desc = op->Op(); - if (OpHasSubBlock(op_desc)) { - auto inputs = op_desc->InputArgumentNames(); - auto outputs = op_desc->OutputArgumentNames(); - vars.insert(inputs.begin(), inputs.end()); - vars.insert(outputs.begin(), outputs.end()); - } + // NOTE(dzhwinter): + // current block can not reuse next level block vars.
+ if (OpHasSubBlock(op_desc)) update_skip_set(op_desc); + // NOTE(dzhwinter): + // distributed ops' input/output names need to + // stay the same between trainer and pserver + if (op_desc->Type() == "send") update_skip_set(op_desc); + if (op_desc->Type() == "recv") update_skip_set(op_desc); + if (op_desc->Type() == "prefetch") update_skip_set(op_desc); } - return vars; } -void AnalysisVarPass::RenameVarInGraphDesc(const std::string& var, - const std::string& cache_var, - size_t idx) const { +void MemoryOptimizePass::RenameVarInGraphDesc(const std::string& var, + const std::string& cache_var, + size_t idx) const { for (size_t i = idx; i < cfg_->Ops().size(); ++i) { auto* op = cfg_->Ops()[i]; PADDLE_ENFORCE(op->IsOp() && op->Op()); @@ -277,7 +260,7 @@ void AnalysisVarPass::RenameVarInGraphDesc, } } -void AnalysisVarPass::InitSSAGraphNodes() const { +void MemoryOptimizePass::InitSSAGraphNodes() const { std::unordered_map> all_vars; if (var_nodes_.empty()) { for (auto* op : cfg_->Ops()) { @@ -297,9 +280,10 @@ } } -void AnalysisVarPass::RenameVarInGraphNode(const std::string& var, - const std::string& cache_var, - size_t idx, ir::Graph* graph) const { +void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var, + const std::string& cache_var, + size_t idx, + ir::Graph* graph) const { // if replace happens, we need to create a newer version cache_var // but use the same dims/data_type with var. PADDLE_ENFORCE(var_nodes_[var].size() >= 1 && @@ -358,39 +342,6 @@ void AnalysisVarPass::RenameVarInGraphNode(const std::string& var, var_nodes_.at(var).clear(); } -bool AnalysisVarPass::NodeCanReused(ir::Node* node) const { - if (!node->IsVar() || node->IsCtrlVar()) return false; - auto* desc = node->Var(); - auto type = desc->GetType(); - if (desc->Persistable() || type != proto::VarType::LOD_TENSOR || - desc->GetShape().empty()) { - return false; - } - // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad - std::string name = node->Name(); - if (!name.empty() && name[0] == '@' && name[name.size() - 1] == '@') - return false; - if (skip_set_.count(name)) return false; - for (auto* op : node->inputs) { - if (op->Op()->HasAttr("force_cpu")) { - // op output force generated in cpu, can not be reused.
- return framework::AttrReader(op->Op()->GetAttrMap()) - .Get("force_cpu") == 0; - } - } - return true; -} - -bool AnalysisVarPass::OpHasSubBlock(OpDesc* desc) const { - const AttributeMap& attrs = desc->GetAttrMap(); - for (auto& attr : attrs) { - if (attr.second.type() == typeid(BlockDesc*) || // NOLINT - attr.second.type() == typeid(std::vector)) // NOLINT - return true; - } - return false; -} - std::vector SortOpLikeDescOrder(const ir::Graph& graph) { PADDLE_ENFORCE(graph.Has(kAllOpDescs), "Graph has no attribute of kAllOpDescs."); @@ -651,6 +602,7 @@ ir::Node* ControlFlowGraph::GetNodeFromVarName(const std::string& name, } // namespace framework } // namespace paddle -REGISTER_PASS(analysis_var_pass, paddle::framework::details::AnalysisVarPass) +REGISTER_PASS(memory_optimize_pass, + paddle::framework::details::MemoryOptimizePass) .RequireGraphAttr(paddle::framework::details::kGraphNodePool) .RequireGraphAttr(paddle::framework::details::kAllOpDescs); diff --git a/paddle/fluid/framework/details/analysis_var_pass.h b/paddle/fluid/framework/details/memory_optimize_pass.h similarity index 85% rename from paddle/fluid/framework/details/analysis_var_pass.h rename to paddle/fluid/framework/details/memory_optimize_pass.h index 144204beafb341351172c29e3b4cd41db49be6f9..3d6b1897f3b5106054b8f647f9cf613ebd1d65ff 100644 --- a/paddle/fluid/framework/details/analysis_var_pass.h +++ b/paddle/fluid/framework/details/memory_optimize_pass.h @@ -25,7 +25,7 @@ #include #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/details/memory_reuse_types.h" +#include "paddle/fluid/framework/details/memory_optimize_helper.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/pass.h" @@ -35,12 +35,10 @@ namespace details { constexpr char kAllOpDescs[] = "all_op_descs"; std::vector SortOpLikeDescOrder(const ir::Graph& graph); -// sort op in bfs order -std::vector BFSSortGraphOps(const ir::Graph& graph); class ControlFlowGraph; -class AnalysisVarPass : public ir::Pass { +class MemoryOptimizePass : public ir::Pass { protected: std::unique_ptr ApplyImpl( std::unique_ptr graph) const override; @@ -57,17 +55,14 @@ class AnalysisVarPass : public ir::Pass { ir::Graph* graph) const; void SubGraphOptimize(OpDesc* op_desc) const; - // valid a tensor can be reuse or not - bool NodeCanReused(ir::Node* node) const; - // scan subblock and collect the output/input variables. - std::unordered_set GetSubBlockVars( - const std::unordered_set&) const; - // check op has subblock or not - bool OpHasSubBlock(OpDesc* desc) const; + // 1. scan op with subblock and collect the output/input vars. + // while, while_grad, conditional_block + // 2. scan distributed ops and collect the output/input vars + void CollectSkipVarsSet(const std::unordered_set&) const; private: // Reuse Node Pool, Owned. 
- mutable OrderedNodePairPool pool_; + mutable OrderedNodeList pool_; // controlflow Graph mutable std::unique_ptr cfg_; // skip set diff --git a/paddle/fluid/framework/details/analysis_var_pass_test.cc b/paddle/fluid/framework/details/memory_optimize_pass_test.cc similarity index 90% rename from paddle/fluid/framework/details/analysis_var_pass_test.cc rename to paddle/fluid/framework/details/memory_optimize_pass_test.cc index 9bc4fd33f7058949ca60983ea666a21cb4877b3e..3d3dfa93594d496431f7cb60dceb26f20250fc16 100644 --- a/paddle/fluid/framework/details/analysis_var_pass_test.cc +++ b/paddle/fluid/framework/details/memory_optimize_pass_test.cc @@ -12,63 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/details/analysis_var_pass.h" +#include "paddle/fluid/framework/details/memory_optimize_pass.h" #include #include #include #include "glog/logging.h" #include "gtest/gtest.h" +#include "paddle/fluid/framework/details/graph_test_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -namespace paddle { -namespace framework { - -class DummyOp : public OperatorBase { - public: - DummyOp(const std::string& type, const VariableNameMap& inputs, - const VariableNameMap& outputs, const AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const Scope& scope, - const platform::Place& place) const override {} -}; - -class SumOpMaker : public OpProtoAndCheckerMaker { - public: - void Make() { - AddInput("X", "").AsDuplicable(); - AddOutput("Out", ""); - AddComment(""); - } -}; - -class AssignOpMaker : public OpProtoAndCheckerMaker { - public: - void Make() { - AddInput("X", "").AsDuplicable(); - AddOutput("Out", ""); - AddComment(""); - } -}; - -class DummyVarTypeInference : public VarTypeInference { - public: - void operator()(const OpDesc& op_desc, BlockDesc* block) const override { - auto& inputs = op_desc.Input("X"); - auto type = block->Var(inputs.front())->GetType(); - auto out_var_name = op_desc.Output("Out").front(); - block->Var(out_var_name)->SetType(type); - } -}; - -} // namespace framework -} // namespace paddle - REGISTER_OPERATOR(sum, paddle::framework::DummyOp, paddle::framework::SumOpMaker, paddle::framework::DummyVarTypeInference); @@ -141,15 +97,6 @@ inline static ProgramDesc FillProgramDesc() { return prog; } -template -inline static std::string DebugString(const Container& c) { - std::stringstream ss; - for (auto& item : c) { - ss << item << " "; - } - return ss.str(); -} - TEST(CFGGraph, IRGraph) { // prepare ir graph auto prog = FillProgramDesc(); diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h index eea7e712f8f6e187cdceedce77cc76d1d4ca2101..0901e59f9786b43361e7a570f8c2a07be54c1ac2 100644 --- a/paddle/fluid/framework/details/op_registry.h +++ b/paddle/fluid/framework/details/op_registry.h @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/framework/grad_op_desc_maker.h" +#include "paddle/fluid/framework/inplace_op_inference.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/operator.h" @@ -32,7 +33,8 @@ enum OpInfoFillType { kOpProtoAndCheckerMaker = 1, kGradOpDescMaker = 2, kVarTypeInference = 3, - kShapeInference = 4 + kShapeInference = 4, + kInplaceOpInference = 5 }; template @@ -48,8 +50,11 @@ struct OpInfoFillTypeID { ? kVarTypeInference : (std::is_base_of::value ? kShapeInference - : static_cast( - -1))))); + : (std::is_base_of< + InplaceOpInference, T>::value + ? kInplaceOpInference + : static_cast( + -1)))))); } }; @@ -139,6 +144,16 @@ struct OpInfoFiller { } }; +template +struct OpInfoFiller { + void operator()(const char* op_type, OpInfo* info) const { + info->infer_inplace_ = [](const OpDesc& op_desc, BlockDesc* block) { + T infer; + return infer(op_desc, block); + }; + } +}; + } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/inplace_op_inference.h b/paddle/fluid/framework/inplace_op_inference.h new file mode 100644 index 0000000000000000000000000000000000000000..03ab2a2b6c5dc07805fddddc3ac53f61e7b6a697 --- /dev/null +++ b/paddle/fluid/framework/inplace_op_inference.h @@ -0,0 +1,115 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include "glog/logging.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/details/memory_optimize_helper.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/type_defs.h" + +namespace paddle { +namespace framework { + +/* + Inplace inference creates In->Out pairs for an in-place operator. + If we specify a pair of corresponding names, for example X->Out, + then Out will reuse X's memory in place. The base class does + legality validation for both variables.
+*/ +class InplaceOpInference { + public: + virtual ~InplaceOpInference() {} + virtual std::unordered_map operator()( + const OpDesc& op_desc, BlockDesc* block) const = 0; +}; + +class InplaceInToOut : public InplaceOpInference { + public: + std::unordered_map operator()( + const OpDesc& op_desc, BlockDesc* block) const { + std::unordered_map ret; + auto in_out_var_names_pair = this->Apply(op_desc, block); + for (auto& pair : in_out_var_names_pair) { + PADDLE_ENFORCE(!op_desc.Input(pair.first).empty(), + string::Sprintf("op %s does not have input %s!", + op_desc.Type(), pair.first)); + PADDLE_ENFORCE(!op_desc.Output(pair.second).empty(), + string::Sprintf("op %s does not have output %s!", + op_desc.Type(), pair.second)); + auto& in_name = op_desc.Input(pair.first).at(0); + auto& out_name = op_desc.Output(pair.second).at(0); + + auto in = block->FindRecursiveOrCreateVar(in_name); + auto out = block->FindRecursiveOrCreateVar(out_name); + if (TryInplaceInputOutput(in, out)) ret.insert({in_name, out_name}); + } + return ret; + } + + protected: + virtual std::unordered_map Apply( + const OpDesc& op_desc, BlockDesc* block) const = 0; + + bool TryInplaceInputOutput(const VarDesc& in, const VarDesc& out) const { + return in.Name() != out.Name() && details::NodeCanReused(in) && + details::NodeCanReused(out) && + details::NodeSizeInBytes(out) <= details::NodeSizeInBytes(in); + } +}; + +/* + In-place In->Out mapping for operators that have exactly one input + and one output, e.g. activation ops. + */ +class SingleOpInplaceInToOut : public InplaceInToOut { + protected: + std::unordered_map Apply( + const OpDesc& op_desc, BlockDesc* block) const override { + PADDLE_ENFORCE(!op_desc.InputNames().empty(), + "Op inputs must not be empty"); + PADDLE_ENFORCE(!op_desc.OutputNames().empty(), + "Op outputs must not be empty"); + auto x_name = op_desc.InputNames().at(0); + auto out_name = op_desc.OutputNames().at(0); + return std::unordered_map{{x_name, out_name}}; + } +}; + +/* + For gradient ops: the output reuses its corresponding input, + e.g. the Input@Grad->Input reuse strategy. + */ +class GradOpInplaceInToOut : public InplaceInToOut { + protected: + std::unordered_map Apply( + const OpDesc& op_desc, BlockDesc* block) const override { + std::unordered_map ret; + std::unordered_set output_names(op_desc.OutputNames().begin(), + op_desc.OutputNames().end()); + for (auto& input_name : op_desc.InputNames()) { + if (output_names.count(GradVarName(input_name))) { + ret.insert({input_name, GradVarName(input_name)}); + } + } + return ret; + } +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/inplace_op_inference_test.cc b/paddle/fluid/framework/inplace_op_inference_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..3e4d715c6f089496d1b1f7906e3f10147a073622 --- /dev/null +++ b/paddle/fluid/framework/inplace_op_inference_test.cc @@ -0,0 +1,288 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License.
*/ + +#include +#include +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/var_type_inference.h" + +namespace paddle { +namespace framework { + +class NOP : public OperatorBase { + public: + NOP(const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, const AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + private: + void RunImpl(const Scope& scope, + const platform::Place& place) const override {} +}; + +class SingleOpMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "").AsDuplicable(); + AddOutput("Out", ""); + AddComment(""); + } +}; + +class SingleGradOpMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* op = new framework::OpDesc(); + op->SetType("single_op_grad"); + op->SetInput("Out", OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + return std::unique_ptr(op); + } +}; + +class SingleOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override { + ctx->HasInput("X"); + ctx->HasOutput("Out"); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + } +}; + +class SingleGradOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override { + ctx->HasInput(framework::GradVarName("Out")); + ctx->HasOutput(framework::GradVarName("X")); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out")); + } +}; + +class MultiOutOpMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "").AsDuplicable(); + AddInput("Y", "").AsDuplicable(); + AddInput("Z", "").AsDuplicable(); + AddOutput("Out", ""); + AddOutput("YOut", ""); + AddOutput("ZOut", ""); + AddOutput("NotReuseOut", ""); + AddComment(""); + } +}; + +class MultiOutShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override { + ctx->ShareDim("X", "Out"); + ctx->ShareDim("Y", "YOut"); + ctx->ShareDim("Z", "ZOut"); + } +}; + +class MultiGradOpMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* op = new framework::OpDesc(); + op->SetType("multi_out_grad"); + op->SetInput("X", Input("X")); + op->SetOutput(framework::GradVarName("Y"), OutputGrad("YOut")); + op->SetOutput(framework::GradVarName("X"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("Z"), OutputGrad("ZOut")); + return std::unique_ptr(op); + } +}; + +class MultiOutGradShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override { + ctx->SetOutputDim(framework::GradVarName("Y"), + ctx->GetInputDim(framework::GradVarName("YOut"))); + ctx->SetOutputDim(framework::GradVarName("X"), + ctx->GetInputDim(framework::GradVarName("Out"))); + ctx->SetOutputDim(framework::GradVarName("Z"), + ctx->GetInputDim(framework::GradVarName("ZOut"))); + } +}; + +class MultiOutInplaceInToOut : public framework::InplaceInToOut { + public: + using framework::InplaceInToOut::InplaceInToOut; + + 
protected: + std::unordered_map Apply( + const OpDesc& op_desc, BlockDesc* block) const override { + return std::unordered_map{ + {"X", "Out"}, {"Y", "YOut"}, {"Z", "ZOut"}, + }; + } +}; + +class MultiOutGradInplaceInToOut : public framework::InplaceInToOut { + public: + using framework::InplaceInToOut::InplaceInToOut; + + protected: + std::unordered_map Apply( + const OpDesc& op_desc, BlockDesc* block) const override { + return std::unordered_map{ + {framework::GradVarName("YOut"), framework::GradVarName("Y")}, + {framework::GradVarName("Out"), framework::GradVarName("X")}, + {framework::GradVarName("ZOut"), framework::GradVarName("Z")}, + }; + } +}; + +} // namespace framework +} // namespace paddle + +namespace f = paddle::framework; +REGISTER_OPERATOR(single_op, f::NOP, f::SingleOpMaker, f::SingleGradOpMaker, + f::SingleOpInplaceInToOut, f::SingleOpShapeInference); +REGISTER_OPERATOR(single_op_grad, f::NOP, f::SingleOpInplaceInToOut, + f::SingleGradOpShapeInference); +REGISTER_OPERATOR(multi_out_op, f::NOP, f::MultiOutOpMaker, f::MultiGradOpMaker, + f::MultiOutInplaceInToOut, f::MultiOutShapeInference); +REGISTER_OPERATOR(multi_out_grad, f::NOP, f::MultiOutGradInplaceInToOut, + f::MultiOutGradShapeInference); + +namespace paddle { +namespace framework { + +TEST(InferInplace, SingleOpInplaceInToOut) { + ProgramDesc prog; + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("single_op"); + op->SetInput("X", {"test2_a", "test2_b", "test2_c"}); + op->SetOutput("Out", {"test2_out"}); + + prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64}); + prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("test2_out"); + prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16}); + + auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; + auto in_to_outs = infer_inplace(*op, op->Block()); + EXPECT_EQ(in_to_outs.size(), 1ul); + auto it = in_to_outs.begin(); + EXPECT_EQ(it->first, "test2_a"); + EXPECT_EQ(it->second, "test2_out"); +} + +TEST(InferInplace, SingleGradOpInplaceInToOut) { + ProgramDesc prog; + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("single_op_grad"); + op->SetInput(GradVarName("Out"), {"test2_out"}); + op->SetOutput(GradVarName("X"), {"test2_a", "test2_b", "test2_c"}); + + prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("test2_out"); + prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16}); + + auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; + auto in_to_outs = infer_inplace(*op, op->Block()); + EXPECT_EQ(in_to_outs.size(), 1ul); + auto it = in_to_outs.begin(); + EXPECT_EQ(it->first, "test2_out"); + EXPECT_EQ(it->second, "test2_a"); +} + +TEST(InferInplace, MultiOutInplaceInToOut) { + ProgramDesc prog; + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("multi_out_op"); + op->SetInput("X", {"a0", "a1"}); + op->SetInput("Y", {"b0"}); + op->SetInput("Z", {"c0", "c1"}); + op->SetOutput("Out", {"o0"}); + op->SetOutput("YOut", {"y0"}); + op->SetOutput("ZOut", {"z0"}); + + 
prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("o0"); + prog.MutableBlock(0)->Var("y0"); + prog.MutableBlock(0)->Var("z0"); + prog.MutableBlock(0)->Var("a0")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("b0")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("c0")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("o0")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("y0")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("z0")->SetShape({32, 16}); + + auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; + auto in_to_outs = infer_inplace(*op, op->Block()); + EXPECT_EQ(in_to_outs.size(), 3ul); + std::unordered_map expects = { + {"a0", "o0"}, {"b0", "y0"}, {"c0", "z0"}, + }; + EXPECT_TRUE(expects == in_to_outs); +} + +TEST(InferInplace, MultiGradInplaceInToOut) { + ProgramDesc prog; + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("multi_out_grad"); + op->SetInput(GradVarName("Out"), {"o0"}); + op->SetInput(GradVarName("YOut"), {"y0"}); + op->SetInput(GradVarName("ZOut"), {"z0"}); + op->SetOutput(GradVarName("X"), {"a0", "a1"}); + op->SetOutput(GradVarName("Y"), {"b0"}); + op->SetOutput(GradVarName("Z"), {"c0", "c1"}); + + prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("o0"); + prog.MutableBlock(0)->Var("y0"); + prog.MutableBlock(0)->Var("z0"); + prog.MutableBlock(0)->Var("a0")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("b0")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("c0")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("o0")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("y0")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("z0")->SetShape({32, 16}); + + auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; + auto in_to_outs = infer_inplace(*op, op->Block()); + + EXPECT_EQ(in_to_outs.size(), 3ul); + std::unordered_map expects = { + {"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"}, + }; + EXPECT_TRUE(expects == in_to_outs); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 914bcce7755bcf0651da29dd669f5d6d14e081d2..07c2c970d4de3cecf03e4cf80e60e81e7a9595a8 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -65,6 +65,7 @@ pass_library(conv_elementwise_add2_act_fuse_pass inference) pass_library(conv_elementwise_add_fuse_pass inference) pass_library(conv_affine_channel_fuse_pass inference) pass_library(transpose_flatten_concat_fuse_pass inference) +pass_library(identity_scale_op_clean_pass base) # There may be many transpose-flatten structures in a model, and the output of # these structures will be used as inputs to the concat Op. 
This pattern will diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index 8bb3c27bdd32d07d58913db043569f6a3bf69aeb..b7f7c3d82e0da4d3ca8795487fa52fba0394e365 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -141,7 +141,8 @@ class Graph { ir::Node *CreateControlDepVar() { // TODO(panyx0718): control var name should be really unique. const std::string name = string::Sprintf( - "%s@%llu", ir::Node::kControlDepVarName, node_set_.size()); + "%s@%llu", static_cast(ir::Node::kControlDepVarName), + node_set_.size()); auto *x = AddNode(new ir::Node(name, ir::Node::Type::kVariable)); x->SetId(num_node_created_++); return x; diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index 8de93cf285e4bf34c2d2bf425fa5f3459704b3d6..22d4c0a91cc1638264a8c57aa2841ff4e65a1400 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -52,16 +52,29 @@ bool HasCircleHelper( ir::Node *node, const std::map> &adj_list, std::unordered_set *visited, - std::unordered_set *in_trace) { + std::unordered_set *in_trace, + std::vector> *circles) { if (visited->find(node) == visited->end()) { visited->insert(node); in_trace->insert(node); for (ir::Node *in : adj_list.at(node)) { if (visited->find(in) == visited->end() && - HasCircleHelper(in, adj_list, visited, in_trace)) { + HasCircleHelper(in, adj_list, visited, in_trace, circles)) { return true; } else if (in_trace->find(in) != in_trace->end()) { + if (circles != nullptr) { + std::vector circle; + circle.emplace_back(in); + ir::Node *p = in; + for (auto &adj : adj_list.at(p)) { + if (in_trace->count(adj)) { + circle.emplace_back(adj); + p = adj; + } + } + circles->emplace_back(circle); + } return true; } } @@ -71,11 +84,12 @@ bool HasCircleHelper( } bool HasCircleInternal( - const std::map> &adj_list) { + const std::map> &adj_list, + std::vector> *circles) { std::unordered_set visited; std::unordered_set in_trace; for (auto &adj : adj_list) { - if (HasCircleHelper(adj.first, adj_list, &visited, &in_trace)) { + if (HasCircleHelper(adj.first, adj_list, &visited, &in_trace, circles)) { return true; } } @@ -84,13 +98,18 @@ bool HasCircleInternal( } // namespace bool HasCircle(const Graph &graph) { - return HasCircleInternal(BuildOperationAdjList(graph)); + return HasCircleInternal(BuildOperationAdjList(graph), nullptr); +} + +bool FindCircleSubGraph(const Graph &graph, + std::vector> *circles) { + return HasCircleInternal(BuildOperationAdjList(graph), circles); } std::vector TopologySortOperations(const Graph &graph) { std::map> adj_list = BuildOperationAdjList(graph); - PADDLE_ENFORCE(!HasCircleInternal(adj_list)); + PADDLE_ENFORCE(!HasCircleInternal(adj_list, nullptr)); std::unordered_set visited; std::vector ret; for (auto adj : adj_list) { diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h index fba4936f2c5c971f6c63a452ec4480ff091db25c..214de9ec7d85aee6021b18866295777e317aa79d 100644 --- a/paddle/fluid/framework/ir/graph_helper.h +++ b/paddle/fluid/framework/ir/graph_helper.h @@ -28,6 +28,11 @@ namespace ir { // Test if the graph contains circle. bool HasCircle(const Graph &graph); +// Find All Circles for debugging, +// store all subgraph in circles. +bool FindCircleSubGraph(const Graph &graph, + std::vector> *circles); + size_t GraphNum(const Graph &graph); // Topology Sort the operations in the graph from inputs to outputs. 
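Since `FindCircleSubGraph` shares its traversal with `HasCircle`, a debugging dump of every detected cycle costs a single call. A hedged sketch of such a helper (the name `DumpCircles` is ours, not part of this patch):

```cpp
#include <string>
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/framework/ir/graph_helper.h"

namespace ir = paddle::framework::ir;

// Log each detected cycle as "a -> b -> c -> "; returns true if any exist.
bool DumpCircles(const ir::Graph& graph) {
  std::vector<std::vector<ir::Node*>> circles;
  if (!ir::FindCircleSubGraph(graph, &circles)) return false;
  for (const auto& circle : circles) {
    std::string path;
    for (ir::Node* n : circle) path += n->Name() + " -> ";
    LOG(WARNING) << "cycle found: " << path;
  }
  return true;
}
```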
diff --git a/paddle/fluid/framework/ir/graph_helper_test.cc b/paddle/fluid/framework/ir/graph_helper_test.cc index 260a73ae763bd2cdea9948e4d928377a7c718dda..d8973d5aeda1a2e0650a506b4c916b4346f01e2d 100644 --- a/paddle/fluid/framework/ir/graph_helper_test.cc +++ b/paddle/fluid/framework/ir/graph_helper_test.cc @@ -195,6 +195,17 @@ void BuildTwoGraphs(Graph* g) { // v4->outputs.push_back(o5); } +TEST(GraphHelperTest, Circles) { + ProgramDesc prog; + + Graph g(prog); + BuildCircleGraph(&g); + + std::vector> circles; + ASSERT_TRUE(FindCircleSubGraph(g, &circles)); + ASSERT_EQ(circles.size(), 1UL); +} + TEST(GraphHelperTest, GraphNum) { ProgramDesc prog; diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 6282ced1e47329915bb3626b410e55ad8251071d..9ea0729e1f3339c2f17371ecc8fa51325b9629bb 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -117,11 +117,6 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) { // return false; } } - for (auto &item : pdnodes2nodes_) { - for (auto &n : item.second) { - GetMarkedNodes(const_cast(&graph)).insert(n); - } - } VLOG(3) << pdnodes2nodes_.size() << " nodes marked"; return !pdnodes2nodes_.empty(); diff --git a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..3b738aa159ebfd77f00c9e532fbd94542e2097db --- /dev/null +++ b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/identity_scale_op_clean_pass.h" +#include +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +std::unique_ptr IdentityScaleOpCleanPass::ApplyImpl( + std::unique_ptr graph) const { + FusePassBase::Init("identity_scale_op_clean", graph.get()); + + // pre_op -> scale_in -> scale_op -> scale_out + // -> + // pre_op -> scale_out + GraphPatternDetector detector; + auto pre_op = detector.mutable_pattern()->NewNode("pre_op")->assert_is_op(); + auto scale_in = detector.mutable_pattern() + ->NewNode("scale_in") + ->assert_is_op_input("scale") + ->AsIntermediate(); + auto scale_op = detector.mutable_pattern() + ->NewNode("scale_fuse") + ->assert_is_op("scale") + ->assert_op_attr("scale", 1.) 
+ ->assert_op_attr("bias", 0.); + auto scale_out = detector.mutable_pattern() + ->NewNode("scale_out") + ->assert_is_op_output("scale"); + + pre_op->LinksTo({scale_in}); + scale_op->LinksFrom({scale_in}).LinksTo({scale_out}); + + GraphPatternDetector::handle_t handler = [&]( + const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { + Node* scale_op_var = subgraph.at(scale_op); + Node* scale_in_var = subgraph.at(scale_in); + Node* scale_out_var = subgraph.at(scale_out); + Node* pre_op_var = subgraph.at(pre_op); + // Link pre_op directly to scale_out + const std::string scale_in_name = scale_in_var->Name(); + const std::string scale_out_name = scale_out_var->Name(); + // Remove links in graph + GraphSafeRemoveNodes(graph, {scale_in_var, scale_op_var}); + // Modify proto message + auto* pre_op_desc = pre_op_var->Op(); + for (auto& parameter : *pre_op_desc->Proto()->mutable_outputs()) { + auto* arguments = parameter.mutable_arguments(); + auto it = std::find(arguments->begin(), arguments->end(), scale_in_name); + PADDLE_ENFORCE(it != arguments->end()); + *it = scale_out_name; + } + + IR_NODE_LINK_TO(pre_op_var, scale_out_var); + }; + + detector(graph.get(), handler); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(identity_scale_op_clean_pass, + paddle::framework::ir::IdentityScaleOpCleanPass); diff --git a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..50a654d82f0e4fb7e8e91c665397716407e6d2a5 --- /dev/null +++ b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h @@ -0,0 +1,33 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
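The pattern above matches `scale` ops whose attributes make them identities, since Out = scale * X + bias = 1.0 * X + 0.0 = X, and rewires the producer to write `scale_out` directly. A sketch of applying the pass standalone, assuming it is linked into the binary and registered under the name used in `REGISTER_PASS` above:

```cpp
#include <memory>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"

namespace ir = paddle::framework::ir;

// Remove identity scale ops (scale == 1.0, bias == 0.0) from a graph.
std::unique_ptr<ir::Graph> CleanIdentityScales(
    std::unique_ptr<ir::Graph> graph) {
  auto pass =
      ir::PassRegistry::Instance().Get("identity_scale_op_clean_pass");
  return pass->Apply(std::move(graph));
}
```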
+ +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class IdentityScaleOpCleanPass : public FusePassBase { + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + + private: + virtual ~IdentityScaleOpCleanPass() = default; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_info.h b/paddle/fluid/framework/op_info.h index 19e5c2c73eac74dee030a4f7820531800f737e4e..4b55bd0703eee399cd841f90ea0b18d8fbdc67e8 100644 --- a/paddle/fluid/framework/op_info.h +++ b/paddle/fluid/framework/op_info.h @@ -38,6 +38,7 @@ struct OpInfo { OpAttrChecker* checker_{nullptr}; InferVarTypeFN infer_var_type_; InferShapeFN infer_shape_; + InferInplaceOpFN infer_inplace_; bool HasOpProtoAndChecker() const { return proto_ != nullptr && checker_ != nullptr; diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index 938e2024c3359c2acd65a1aa4af875a8350e4c58..d02c699b979d7693bd83fd43fc73f7e0aeddb0cc 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -57,5 +57,8 @@ using InferVarTypeFN = using InferShapeFN = std::function; +using InplacePair = std::unordered_map; +using InferInplaceOpFN = std::function; + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 5db422119966948f75970874e13d416ea699158a..ec8dedd605235a2d197e6a313bd589d5b9520cdf 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -1,5 +1,5 @@ if(WITH_PYTHON) -cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas) -cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context) +cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind) +cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind) cc_library(engine SRCS engine.cc) endif() diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 11484a647303b32a6006bef3cfe4be6b3f0d533d..157862016e3556902f6507e02417624363ed1029 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -58,12 +58,13 @@ if(WIN32) sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder) - target_link_libraries(paddle_fluid_shared shlwapi) else(WIN32) cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder) endif() +get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) +target_link_libraries(paddle_fluid_shared ${os_dependency_modules}) set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) if(NOT APPLE AND NOT WIN32) diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index fe3c841186c35ea28c1d44007d91de5b997c1388..7476c199cfd073ec0962fa9a48f24750a6484bb5 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -83,7 +83,6 @@ void IRPassManager::CreatePasses(Argument *argument, new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir))); } - // graph_ = pass->Apply(std::move(graph_)); pre_pass = pass_name; 
passes_.emplace_back(std::move(pass)); @@ -97,8 +96,9 @@ std::unique_ptr IRPassManager::Apply(std::unique_ptr graph) { PADDLE_ENFORCE(graph.get()); // Apply all the passes for (const auto &pass : passes_) { - if (pass->Type() == "graph_viz_pass") continue; - PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass->Type()); + if (pass->Type() != "graph_viz_pass") { + PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass->Type()); + } graph = pass->Apply(std::move(graph)); } return std::move(graph); diff --git a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt index eb6e1768a2c01f1388962eefe8e70368cae8cf8b..410a90132aa7657a23b858570763547fe53730a0 100644 --- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt @@ -1,4 +1,7 @@ cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc) +if(WITH_TESTING) + add_dependencies(subgraph_detector gtest) +endif() if (WITH_GPU AND TENSORRT_FOUND) cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller) diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc index 3d1be9196fdeacd8ff852dbb595473a687352ccf..4b0a9d9b1c48fcb0d5e44ec1b977c817f3c70b2e 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include "paddle/fluid/framework/ir/graph_helper.h" @@ -168,7 +169,11 @@ bool FindSuitableTensorToReuse( if (!cluster->count(candidate)) continue; size_t space = space_table.at(candidate); - size_t space_diff = std::abs(space - space_required); + PADDLE_ENFORCE( + space <= std::numeric_limits::type>::max(), + "space overload"); + size_t space_diff = + std::abs((std::make_signed::type)space - space_required); if (space_diff < best_fit.second) { best_fit.first = candidate; best_fit.second = space_diff; diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index eecab238a88e90399eb70f17caa57633af4e2a69..e92273b4dd94f11e0e90c91fd82dafe42bf158f3 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -318,4 +318,9 @@ NativeConfig AnalysisConfig::ToNativeConfig() const { return config; } +void AnalysisConfig::SwitchIrDebug(int x) { + ir_debug_ = x; + Update(); +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 14d6ba8c56dc3fe04e27bccadd5a5155547398a4..da2e9803f0467f2b83d79cdd06d4317d41630b04 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -58,7 +58,8 @@ namespace { bool IsPersistable(const framework::VarDesc *var) { if (var->Persistable() && var->GetType() != framework::proto::VarType::FEED_MINIBATCH && - var->GetType() != framework::proto::VarType::FETCH_LIST) { + var->GetType() != framework::proto::VarType::FETCH_LIST && + var->GetType() != framework::proto::VarType::RAW) { return true; } return false; diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index 6d11b461082d0ed8ba08c9e280bba86737b86e71..002ba90e40e69d565f5a54e374a3f0083b84273f 100644 --- 
a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -196,7 +196,7 @@ TEST(AnalysisPredictor, memory_optim) { AnalysisConfig config(FLAGS_dirname); config.DisableGpu(); config.EnableMemoryOptim(true); - config.pass_builder()->TurnOnDebug(); + config.SwitchIrDebug(); auto native_predictor = CreatePaddlePredictor(config.ToNativeConfig()); diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 9d9ed6a39d8324002a8850deae9bb8dd5af7ef9b..47361b3279e14dd65a0e6e7f864e508ef1183045 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -140,9 +140,12 @@ struct AnalysisConfig { */ bool tensorrt_engine_enabled() const { return use_tensorrt_; } - /** Control whther to debug IR graph analysis phase. + /** \brief Control whether to debug IR graph analysis phase. + * + * This will generate DOT files for visualizing the computation graph after + * each analysis pass applied. */ - void SwitchIrDebug(int x = true) { ir_debug_ = x; } + void SwitchIrDebug(int x = true); /** Turn on MKLDNN. */ diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 391932a1ee018c45818457c55fd8f82a22ab7405..aa353f12ca7333713e2d640cce6b2dfbea3c4e26 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -117,6 +117,7 @@ class CpuPassStrategy : public PassStrategy { "conv_bn_fuse_pass", // "conv_eltwiseadd_bn_fuse_pass", // "is_test_pass", // + "identity_scale_op_clean_pass", // }); use_gpu_ = false; } @@ -155,6 +156,7 @@ class GpuPassStrategy : public PassStrategy { GpuPassStrategy() : PassStrategy({}) { passes_.assign({ "infer_clean_graph_pass", // + "identity_scale_op_clean_pass", // "conv_affine_channel_fuse_pass", // "conv_eltwiseadd_affine_channel_fuse_pass", // "conv_bn_fuse_pass", // diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc index 8be2a6d79b2ede2c149aa523e38c3960ab30acb1..dd953e0dccbb3749bfcc87966453c6976dfefa10 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc @@ -142,7 +142,7 @@ void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false) { cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params"); cfg->DisableGpu(); cfg->SwitchSpecifyInputNames(); - cfg->pass_builder()->TurnOnDebug(); + cfg->SwitchIrDebug(); cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); if (use_mkldnn) { cfg->EnableMKLDNN(); diff --git a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc index 2db297e2005c6b657259187d6b6b76657d9e4388..2003be82019333ca97b9fa8ef83668825fe5710d 100644 --- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc @@ -69,7 +69,7 @@ void SetInput(std::vector> *inputs) { TEST(Analyzer_Text_Classification, profile) { AnalysisConfig cfg; SetConfig(&cfg); - cfg.pass_builder()->TurnOnDebug(); + cfg.SwitchIrDebug(); std::vector outputs; std::vector> input_slots_all; diff --git a/paddle/fluid/inference/utils/benchmark_tester.cc b/paddle/fluid/inference/utils/benchmark_tester.cc index 
80763160df3adfd8c34e66bc7a5370808b349e76..0c48c2db9b691ae8cf587f2729c2789d4ce2dbe1 100644 --- a/paddle/fluid/inference/utils/benchmark_tester.cc +++ b/paddle/fluid/inference/utils/benchmark_tester.cc @@ -34,6 +34,6 @@ TEST(Benchmark, PersistToFile) { benchmark.SetLatency(220); benchmark.PersistToFile("1.log"); - benchmark.PersistToFile("1.log"); - benchmark.PersistToFile("1.log"); + benchmark.PersistToFile("2.log"); + benchmark.PersistToFile("3.log"); } diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc index 8759ec8096cf102ab85d2c2a91eddc23a6ed0e50..46aca62a27490b0141245beeea9a3e61d0fee651 100644 --- a/paddle/fluid/memory/allocation/legacy_allocator.cc +++ b/paddle/fluid/memory/allocation/legacy_allocator.cc @@ -59,11 +59,6 @@ size_t memory_usage(const platform::Place &p); using BuddyAllocator = detail::BuddyAllocator; -std::unordered_map> - gpu_mem_info; - BuddyAllocator *GetCPUBuddyAllocator() { // We tried thread_local for inference::RNN1 model, but that not works much // for multi-thread test. @@ -144,6 +139,8 @@ BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) { devices = platform::GetSelectedDevices(); int gpu_num = devices.size(); + allocation::GPUMemMonitor.Initialize(devices.size()); + a_arr = new BuddyAllocator *[gpu_num]; for (size_t i = 0; i < devices.size(); ++i) { int dev_id = devices[i]; @@ -190,25 +187,19 @@ void *Alloc(const platform::CUDAPlace &place, platform::SetDeviceId(place.device); size_t avail, total; platform::GpuMemoryUsage(&avail, &total); - LOG(WARNING) << "Cannot allocate " << string::HumanReadableSize(size) - << " in GPU " << place.device << ", available " - << string::HumanReadableSize(avail); - LOG(WARNING) << "total " << total; - LOG(WARNING) << "GpuMinChunkSize " - << string::HumanReadableSize( - buddy_allocator->GetMinChunkSize()); - LOG(WARNING) << "GpuMaxChunkSize " - << string::HumanReadableSize( - buddy_allocator->GetMaxChunkSize()); - LOG(WARNING) << "GPU memory used: " - << string::HumanReadableSize(Used(place)); + LOG(FATAL) << "Cannot allocate " << string::HumanReadableSize(size) + << " in GPU " << place.device << ", available " + << string::HumanReadableSize(avail) << ", total " << total + << ", GpuMinChunkSize " + << string::HumanReadableSize(buddy_allocator->GetMinChunkSize()) + << ", GpuMaxChunkSize " + << string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()) + << ", GPU memory used: " + << string::HumanReadableSize(Used(place)); platform::SetDeviceId(cur_dev); } else { - gpu_mem_info[place.device].first += size; - if (gpu_mem_info[place.device].first > gpu_mem_info[place.device].second) { - gpu_mem_info[place.device].second = gpu_mem_info[place.device].first; - VLOG(3) << "device: " << place.device << " peak memory usage : " - << (gpu_mem_info[place.device].second >> 20) << " MiB"; + if (VLOG_IS_ON(3)) { + allocation::GPUMemMonitor.Add(place.device, size); } if (FLAGS_init_allocated_mem) { cudaMemset(ptr, 0xEF, size); @@ -225,7 +216,9 @@ void Free(const platform::CUDAPlace &place, void *p, size_t size) { #ifdef PADDLE_WITH_CUDA GetGPUBuddyAllocator(place.device)->Free(p); - gpu_mem_info[place.device].first -= size; + if (VLOG_IS_ON(3)) { + allocation::GPUMemMonitor.Minus(place.device, size); + } #else PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); #endif @@ -264,7 +257,7 @@ void *Alloc(const platform::CUDAPinnedPlace &place, void *ptr = buddy_allocator->Alloc(size); if (ptr == nullptr) { - LOG(WARNING) << "cudaMallocHost Cannot allocate " << size +
LOG(WARNING) << "cudaHostAlloc Cannot allocate " << size << " bytes in CUDAPinnedPlace"; } if (FLAGS_init_allocated_mem) { @@ -335,6 +328,8 @@ size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const { namespace allocation { +LegacyMemMonitor GPUMemMonitor; + Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { void *ptr = boost::apply_visitor(legacy::AllocVisitor(size), place_); return new Allocation(ptr, size, place_); @@ -346,6 +341,63 @@ void LegacyAllocator::Free(Allocation *allocation) { allocation->place()); delete allocation; } + +bool MemInfo::Add(const size_t &size) { + std::lock_guard lock(mutex_); + usage_ += size; + bool peak_point = usage_ > peak_usage_; + if (peak_point) peak_usage_ = usage_; + return peak_point; +} + +void MemInfo::Minus(const size_t &size) { + std::lock_guard lock(mutex_); + usage_ -= size; +} + +uint64_t MemInfo::GetPeakUsage() { return peak_usage_; } + +LegacyMemMonitor::~LegacyMemMonitor() { + for (auto &item : gpu_mem_info_) delete item.second; +} + +void LegacyMemMonitor::Initialize(const int &device_num) { + for (auto i = 0; i < device_num; ++i) { + gpu_mem_info_[i] = new MemInfo(); + } +} + +void LegacyMemMonitor::Add(const int &device, const size_t &size) { + if (gpu_mem_info_[device]->Add(size)) { + VLOG(3) << "#LegacyMemMonitor# device: " << device + << " peak memory usage : " + << (gpu_mem_info_[device]->GetPeakUsage() >> 20) << " MiB"; + } +} + +void LegacyMemMonitor::Minus(const int &device, const size_t &size) { + gpu_mem_info_[device]->Minus(size); +} + +uint64_t LegacyMemMonitor::GetMemUsage(const int &device) { + return gpu_mem_info_.find(device) == gpu_mem_info_.end() + ? 0 + : gpu_mem_info_[device]->GetPeakUsage(); +} + +void LegacyMemMonitor::PrintMemUsage() { + std::vector devices; + for (const auto &item : gpu_mem_info_) { + devices.emplace_back(item.first); + } + std::sort(devices.begin(), devices.end()); + for (const auto &device : devices) { + std::cout << "Device : " << device << " Peak Memory Usage : " + << (gpu_mem_info_[device]->GetPeakUsage() >> 20) << " MiB" + << std::endl; + } +} + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/legacy_allocator.h b/paddle/fluid/memory/allocation/legacy_allocator.h index 503a7a685cb9d8dbbbbd6c23b5b82c383893e3d8..ccbc8c70d8e9a16e7edb1be54bf80bec3b368eca 100644 --- a/paddle/fluid/memory/allocation/legacy_allocator.h +++ b/paddle/fluid/memory/allocation/legacy_allocator.h @@ -13,12 +13,59 @@ // limitations under the License. 
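`MemInfo::Add` above updates the running usage under a mutex and reports whether a new peak was reached, so the allocator logs only on fresh peaks rather than on every allocation. The core pattern, condensed into a self-contained sketch (class name ours):

```cpp
#include <cstddef>
#include <cstdint>
#include <mutex>

// Thread-safe running counter that reports newly reached peaks.
class PeakCounter {
 public:
  // Returns true exactly when this addition pushes usage to a new peak.
  bool Add(size_t size) {
    std::lock_guard<std::mutex> lock(mutex_);
    usage_ += size;
    bool new_peak = usage_ > peak_;
    if (new_peak) peak_ = usage_;
    return new_peak;
  }
  void Minus(size_t size) {
    std::lock_guard<std::mutex> lock(mutex_);
    usage_ -= size;
  }
  uint64_t Peak() const { return peak_; }

 private:
  uint64_t usage_{0};
  uint64_t peak_{0};
  std::mutex mutex_;
};
```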
#pragma once +#include +#include // NOLINT +#include +#include +#include #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/platform/place.h" namespace paddle { namespace memory { namespace allocation { +class MemInfo { + public: + MemInfo() : usage_(0), peak_usage_(0) {} + MemInfo(const MemInfo &) = delete; + MemInfo &operator=(const MemInfo &) = delete; + + // returns true if the current operation creates a new peak point + bool Add(const size_t &); + void Minus(const size_t &); + + uint64_t GetPeakUsage(); + + private: + /* current memory usage */ + uint64_t usage_; + uint64_t peak_usage_; + std::mutex mutex_; +}; + +class LegacyMemMonitor { + public: + // used to store the GPU memory usage of each device + using MemUsage = std::unordered_map; + + MemUsage GetMemUsageInfo() { return gpu_mem_info_; } + ~LegacyMemMonitor(); + + void Initialize(const int &); + void Add(const int &, const size_t &); + void Minus(const int &, const size_t &); + + uint64_t GetMemUsage(const int &); + + void PrintMemUsage(); + + protected: + MemUsage gpu_mem_info_; +}; + +extern LegacyMemMonitor GPUMemMonitor; + class LegacyAllocatorPrivate; class LegacyAllocator : public Allocator { public: diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc index 6ac3aefdd18d6d9a21dc7ce66511013dfb78bc5b..de81d12cca6ca280289371abdec225c9e2b8f4d0 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/pinned_allocator.cc @@ -32,7 +32,7 @@ Allocation *CPUPinnedAllocator::AllocateImpl(size_t size, // "CPUPinnedAllocator should be used for Cross-Device Communication"); void *ptr; - PADDLE_ENFORCE(cudaMallocHost(&ptr, size)); + PADDLE_ENFORCE(cudaHostAlloc(&ptr, size, cudaHostAllocPortable)); return new CPUPinnedAllocation(ptr, size); } } // namespace allocation diff --git a/paddle/fluid/memory/allocation/pinned_allocator.h b/paddle/fluid/memory/allocation/pinned_allocator.h index 26d12dd91c7fda31802226a84d883b6a6e9abbe4..42d0938f2afbb1efca8bfdd7035bc0eada30f06b 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.h +++ b/paddle/fluid/memory/allocation/pinned_allocator.h @@ -19,7 +19,7 @@ namespace paddle { namespace memory { namespace allocation { -// Allocator uses `cudaMallocHost` +// Allocator uses `cudaHostAlloc` class CPUPinnedAllocation : public Allocation { public: CPUPinnedAllocation(void *ptr, size_t size) diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 3e8fb83e9d5ba2078bcf37e4a4af74708df9c11c..197d1c2f21fd818879aafe17599bc87d33caa198 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -173,14 +173,14 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) { void* p; // PINNED memory is visible to all CUDA contexts.
- cudaError_t result = cudaMallocHost(&p, size); + cudaError_t result = cudaHostAlloc(&p, size, cudaHostAllocPortable); if (result == cudaSuccess) { *index = 1; // PINNED memory cuda_pinnd_alloc_size_ += size; return p; } else { - LOG(WARNING) << "cudaMallocHost failed."; + LOG(WARNING) << "cudaHostAlloc failed."; return nullptr; } diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 7ec9d2fed53c9c73952db7dcdfc2d8e634f3f84e..189db2317d0544014d9c74e0fd5e9ead54925b9c 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -547,12 +547,14 @@ namespace ops = paddle::operators; __macro(Swish, swish); \ __macro(ThresholdedRelu, thresholded_relu); -#define REGISTER_INPLACE_ACTIVATION_OP(OP_NAME, KERNEL_TYPE) \ - REGISTER_OPERATOR(KERNEL_TYPE, ::paddle::operators::ActivationOp, \ - ::paddle::operators::OP_NAME##OpMaker, \ - ::paddle::operators::ActivationOpInferVarType, \ - ::paddle::operators::OP_NAME##GradMaker); \ - REGISTER_OPERATOR(KERNEL_TYPE##_grad, ::paddle::operators::ActivationOpGrad) +#define REGISTER_INPLACE_ACTIVATION_OP(OP_NAME, KERNEL_TYPE) \ + REGISTER_OPERATOR(KERNEL_TYPE, ::paddle::operators::ActivationOp, \ + ::paddle::operators::OP_NAME##OpMaker, \ + ::paddle::operators::ActivationOpInferVarType, \ + ::paddle::operators::OP_NAME##GradMaker, \ + ::paddle::framework::SingleOpInplaceInToOut); \ + REGISTER_OPERATOR(KERNEL_TYPE##_grad, ::paddle::operators::ActivationOpGrad, \ + ::paddle::framework::SingleOpInplaceInToOut) #define REGISTER_ACTIVATION_OP(OP_NAME, KERNEL_TYPE) \ REGISTER_OPERATOR(KERNEL_TYPE, ::paddle::operators::ActivationOp, \ diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 8b672e09b2c5c203c1a1447fbbd14a45ef7ba257..feac4125381bd897dac89943af44850012e4761d 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -589,8 +589,10 @@ class BatchNormGradMaker : public framework::SingleGradOpDescMaker { op->SetInput("SavedVariance", Output("SavedVariance")); // used when setting use_global_stats True during training - op->SetInput("Mean", Output("MeanOut")); - op->SetInput("Variance", Output("VarianceOut")); + if (boost::get(GetAttr("use_global_stats"))) { + op->SetInput("Mean", Output("MeanOut")); + op->SetInput("Variance", Output("VarianceOut")); + } op->SetAttrMap(Attrs()); @@ -602,13 +604,48 @@ class BatchNormGradMaker : public framework::SingleGradOpDescMaker { } }; +class BatchNormInplaceInToOut : public framework::InplaceInToOut { + public: + using InplaceInToOut::InplaceInToOut; + + protected: + std::unordered_map Apply( + const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + std::unordered_map inplace_in_to_out = { + {"Mean", "MeanOut"}, {"Variance", "VarianceOut"}, {"X", "Y"}, + }; + return inplace_in_to_out; + } +}; + +class BatchNormGradInplaceInToOut : public framework::InplaceInToOut { + public: + using InplaceInToOut::InplaceInToOut; + + protected: + std::unordered_map Apply( + const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + std::unordered_map inplace_in_to_out = { + // Scale, Bias, SavedMean, SavedVariance shape is [batch_size, C] + {framework::GradVarName("Y"), framework::GradVarName("X")}, + {"SavedMean", framework::GradVarName("Scale")}, + {"SavedVariance", framework::GradVarName("Bias")}, + }; + return inplace_in_to_out; + } +}; + } // namespace operators } // namespace paddle namespace ops = 
paddle::operators; REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, - ops::BatchNormOpInferVarType, ops::BatchNormGradMaker); -REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp); + ops::BatchNormOpInferVarType, ops::BatchNormGradMaker, + ops::BatchNormInplaceInToOut); +REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp, + ops::BatchNormGradInplaceInToOut); REGISTER_OP_CPU_KERNEL( batch_norm, ops::BatchNormKernel<paddle::platform::CPUDeviceContext, float>, diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index cace42bc1bae93287c330e54d12126efbf9a14bb..f6fbe97565c43c306ea885c765c0a665492fa317 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -31,6 +31,7 @@ detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc polygon_box_transform_op.cu) detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc) detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc) +detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu) detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc) if(WITH_GPU) diff --git a/paddle/fluid/operators/detection/bbox_util.h b/paddle/fluid/operators/detection/bbox_util.h index b99edb5bf05f94e762b377a8882e4c3fcdb5afad..a7bc3e027229884e78721d29428a8ab3f08a6ebc 100644 --- a/paddle/fluid/operators/detection/bbox_util.h +++ b/paddle/fluid/operators/detection/bbox_util.h @@ -99,5 +99,29 @@ void BboxOverlaps(const framework::Tensor& r_boxes, } } +template <typename T> +void ClipTiledBoxes(const platform::DeviceContext& ctx, + const framework::Tensor& im_info, + const framework::Tensor& input_boxes, + framework::Tensor* out) { + T* out_data = out->mutable_data<T>(ctx.GetPlace()); + const T* im_info_data = im_info.data<T>(); + const T* input_boxes_data = input_boxes.data<T>(); + T zero(0); + T im_w = round(im_info_data[1] / im_info_data[2]); + T im_h = round(im_info_data[0] / im_info_data[2]); + for (int64_t i = 0; i < input_boxes.numel(); ++i) { + if (i % 4 == 0) { + out_data[i] = std::max(std::min(input_boxes_data[i], im_w - 1), zero); + } else if (i % 4 == 1) { + out_data[i] = std::max(std::min(input_boxes_data[i], im_h - 1), zero); + } else if (i % 4 == 2) { + out_data[i] = std::max(std::min(input_boxes_data[i], im_w - 1), zero); + } else { + out_data[i] = std::max(std::min(input_boxes_data[i], im_h - 1), zero); + } + } +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detection/box_clip_op.cc b/paddle/fluid/operators/detection/box_clip_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3aa766559a530bc31fbb277f2bcd474da776e63b --- /dev/null +++ b/paddle/fluid/operators/detection/box_clip_op.cc @@ -0,0 +1,86 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#include "paddle/fluid/operators/detection/box_clip_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class BoxClipOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of BoxClipOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("ImInfo"), + "Input(ImInfo) of BoxClipOp should not be null."); + + auto input_box_dims = ctx->GetInputDim("Input"); + auto im_info_dims = ctx->GetInputDim("ImInfo"); + + if (ctx->IsRuntime()) { + auto input_box_size = input_box_dims.size(); + PADDLE_ENFORCE_EQ(input_box_dims[input_box_size - 1], 4, + "The last dimension of Input must be 4"); + PADDLE_ENFORCE_EQ(im_info_dims.size(), 2, + "The rank of Input(Input) in BoxClipOp must be 2"); + PADDLE_ENFORCE_EQ(im_info_dims[1], 3, + "The last dimension of ImInfo must be 3"); + } + ctx->ShareDim("Input", /*->*/ "Output"); + ctx->ShareLoD("Input", /*->*/ "Output"); + } +}; + +class BoxClipOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Input", + "(LoDTensor) " + "Input is a LoDTensor with shape [..., 4] holds 4 points" + "in last dimension in format [xmin, ymin, xmax, ymax]"); + AddInput("ImInfo", + "(Tensor) Information for image reshape is in shape (N, 3), " + "in format (height, width, im_scale)"); + AddOutput("Output", + "(LoDTensor) " + "Output is a LoDTensor with the same shape as Input" + "and it is the result after clip"); + AddComment(R"DOC( +This operator clips input boxes to original input images. + +For each input box, The formula is given as follows: + + $$xmin = \max(\min(xmin, im_w - 1), 0)$$ + $$ymin = \max(\min(ymin, im_h - 1), 0)$$ + $$xmax = \max(\min(xmax, im_w - 1), 0)$$ + $$ymax = \max(\min(ymax, im_h - 1), 0)$$ + +where im_w and im_h are computed from ImInfo, the formula is given as follows: + + $$im_w = \round(width / im_scale)$$ + $$im_h = \round(height / im_scale)$$ +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(box_clip, ops::BoxClipOp, ops::BoxClipOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL( + box_clip, ops::BoxClipKernel, + ops::BoxClipKernel); diff --git a/paddle/fluid/operators/detection/box_clip_op.cu b/paddle/fluid/operators/detection/box_clip_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..b727da5f7b736b6f22407d1dfbca708ed0cf04d9 --- /dev/null +++ b/paddle/fluid/operators/detection/box_clip_op.cu @@ -0,0 +1,74 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include <algorithm> +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detection/box_clip_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +static constexpr int ImInfoSize = 3; + +template <typename T, int BlockSize> +static __global__ void GPUBoxClip(const T *input, const size_t *lod, + const size_t width, const T *im_info, + T *output) { + T im_w = round(im_info[blockIdx.x * ImInfoSize + 1] / + im_info[blockIdx.x * ImInfoSize + 2]); + T im_h = round(im_info[blockIdx.x * ImInfoSize] / + im_info[blockIdx.x * ImInfoSize + 2]); + for (int i = threadIdx.x; i < (lod[blockIdx.x + 1] - lod[blockIdx.x]) * width; + i += BlockSize) { + int idx = lod[blockIdx.x] * width + i; + T im_size = (idx % 2 == 0) ? im_w : im_h; + output[idx] = max(min(input[idx], im_size - 1), T(0.)); + } +} + +template <typename DeviceContext, typename T> +class GPUBoxClipKernel : public framework::OpKernel<T> { + public: + void Compute(const framework::ExecutionContext &context) const override { + PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()), + "This kernel only runs on GPU device."); + auto *input = context.Input<LoDTensor>("Input"); + auto *im_info = context.Input<Tensor>("ImInfo"); + auto *output = context.Output<LoDTensor>("Output"); + const int64_t num = input->dims()[0]; + const int64_t bbox_width = input->numel() / num; + auto lod = input->lod(); + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + auto &dev_ctx = context.template device_context<platform::CUDADeviceContext>(); + auto stream = dev_ctx.stream(); + const size_t batch_size = lod.back().size() - 1; + T *output_data = output->mutable_data<T>(dev_ctx.GetPlace()); + GPUBoxClip<T, 512><<<batch_size, 512, 0, stream>>>( + input->data<T>(), abs_offset_lod[0].CUDAMutableData(dev_ctx.GetPlace()), + bbox_width, im_info->data<T>(), output_data); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + box_clip, ops::GPUBoxClipKernel<paddle::platform::CUDADeviceContext, float>, + ops::GPUBoxClipKernel<paddle::platform::CUDADeviceContext, double>); diff --git a/paddle/fluid/operators/detection/box_clip_op.h b/paddle/fluid/operators/detection/box_clip_op.h new file mode 100644 index 0000000000000000000000000000000000000000..74e1f88f8d8b28e490d170934760bd9bffc807bc --- /dev/null +++ b/paddle/fluid/operators/detection/box_clip_op.h @@ -0,0 +1,50 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#pragma once +#include <string> +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detection/bbox_util.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template <typename DeviceContext, typename T> +class BoxClipKernel : public framework::OpKernel<T> { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input_box = context.Input<LoDTensor>("Input"); + auto* im_info = context.Input<Tensor>("ImInfo"); + auto* output_box = context.Output<LoDTensor>("Output"); + auto& dev_ctx = + context.template device_context<platform::CPUDeviceContext>(); + output_box->mutable_data<T>(context.GetPlace()); + if (input_box->lod().size()) { + PADDLE_ENFORCE_EQ(input_box->lod().size(), 1UL, + "Only support 1 level of LoD."); + } + auto box_lod = input_box->lod().back(); + int64_t n = static_cast<int64_t>(box_lod.size() - 1); + for (int i = 0; i < n; ++i) { + Tensor im_info_slice = im_info->Slice(i, i + 1); + Tensor box_slice = input_box->Slice(box_lod[i], box_lod[i + 1]); + Tensor output_slice = output_box->Slice(box_lod[i], box_lod[i + 1]); + ClipTiledBoxes<T>(dev_ctx, im_info_slice, box_slice, &output_slice); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/box_coder_op.cc b/paddle/fluid/operators/detection/box_coder_op.cc index fdcff62e1fe59b3a2f4925bdff98632f71220abb..0a51d50e06176e713922837861f2102c9ee8a899 100644 --- a/paddle/fluid/operators/detection/box_coder_op.cc +++ b/paddle/fluid/operators/detection/box_coder_op.cc @@ -38,20 +38,12 @@ class BoxCoderOp : public framework::OperatorWithKernel { "The shape of PriorBox is [N, 4]"); if (ctx->HasInput("PriorBoxVar")) { auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar"); - PADDLE_ENFORCE( - prior_box_var_dims.size() == 1 || prior_box_var_dims.size() == 2, - "Input(PriorBoxVar) of BoxCoderOp should be 1 or 2."); - if (prior_box_var_dims.size() == 1) { - PADDLE_ENFORCE_EQ( - prior_box_var_dims[0], 4, - "The 1st dimension of Input(PriorBoxVar) should be 4" "when the rank is 1."); - } else { - PADDLE_ENFORCE_EQ( - prior_box_dims, prior_box_var_dims, - "The dimension of Input(PriorBoxVar) should be equal to" "the dimension of Input(PriorBox when the rank is 2.)"); - } + PADDLE_ENFORCE(prior_box_var_dims.size() == 2, + "The rank of Input(PriorBoxVar) in BoxCoderOp should be 2."); + PADDLE_ENFORCE_EQ( + prior_box_dims, prior_box_var_dims, + "The dimension of Input(PriorBoxVar) should be equal to " + "the dimension of Input(PriorBox) when the rank is 2."); } } diff --git a/paddle/fluid/operators/detection/box_coder_op.cu b/paddle/fluid/operators/detection/box_coder_op.cu index e078af3eb478a8bebc6a7fc6460d169d803a3c4b..19a5bb90fa828899ad6270c051090dd3662aeed8 100644 --- a/paddle/fluid/operators/detection/box_coder_op.cu +++ b/paddle/fluid/operators/detection/box_coder_op.cu @@ -56,10 +56,7 @@ __global__ void EncodeCenterSizeKernel( output[idx * len + 2] = log(fabs(target_box_width / prior_box_width)); output[idx * len + 3] = log(fabs(target_box_height / prior_box_height)); if (prior_box_var_data) { - int prior_var_offset = 0; - if (prior_box_var_size == 2) { - prior_var_offset = col_idx * len; - } + int prior_var_offset = col_idx * len; output[idx * len] /= prior_box_var_data[prior_var_offset]; output[idx * len + 1] /= prior_box_var_data[prior_var_offset + 1]; output[idx * len + 2] /= prior_box_var_data[prior_var_offset + 2]; @@ -99,10 +96,7 @@ __global__ void DecodeCenterSizeKernel( T box_var_x = T(1), box_var_y =
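// Worked example for the clip kernels above (values assumed): with
// im_info = {600, 800, 2} the clip window is im_w = round(800 / 2) = 400 and
// im_h = round(600 / 2) = 300, so a box {-10, 20, 500, 290} is clipped to
// {0, 20, 399, 290}. With LoD {{0, 2, 5}}, the CPU kernel slices boxes
// [0, 2) and [2, 5) and pairs each slice with its image's row of ImInfo.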
T(1); T box_var_w = T(1), box_var_h = T(1); if (prior_box_var_data) { - int prior_var_offset = 0; - if (prior_box_var_size == 2) { - prior_var_offset = axis == 0 ? col_idx * len : row_idx * len; - } + int prior_var_offset = axis == 0 ? col_idx * len : row_idx * len; box_var_x = prior_box_var_data[prior_var_offset]; box_var_y = prior_box_var_data[prior_var_offset + 1]; box_var_w = prior_box_var_data[prior_var_offset + 2]; diff --git a/paddle/fluid/operators/detection/box_coder_op.h b/paddle/fluid/operators/detection/box_coder_op.h index a0b1faf7bdc7001eba2d92b4d03fbaf9feb7bcbb..6d406f8196f9964c85bb94541fa7a7a23857539b 100644 --- a/paddle/fluid/operators/detection/box_coder_op.h +++ b/paddle/fluid/operators/detection/box_coder_op.h @@ -79,10 +79,7 @@ class BoxCoderKernel : public framework::OpKernel { output[offset + 3] = std::log(std::fabs(target_box_height / prior_box_height)); if (prior_box_var) { - int prior_var_offset = 0; - if (prior_box_var->dims().size() == 2) { - prior_var_offset = j * len; - } + int prior_var_offset = j * len; output[offset] /= prior_box_var_data[prior_var_offset]; output[offset + 1] /= prior_box_var_data[prior_var_offset + 1]; output[offset + 2] /= prior_box_var_data[prior_var_offset + 2]; @@ -95,11 +92,12 @@ class BoxCoderKernel : public framework::OpKernel { } } } + template void DecodeCenterSize(const framework::Tensor* target_box, const framework::Tensor* prior_box, const framework::Tensor* prior_box_var, - const bool normalized, const int axis, - const std::vector variance, T* output) const { + const bool normalized, std::vector variance, + T* output) const { int64_t row = target_box->dims()[0]; int64_t col = target_box->dims()[1]; int64_t len = target_box->dims()[2]; @@ -107,19 +105,17 @@ class BoxCoderKernel : public framework::OpKernel { auto* target_box_data = target_box->data(); auto* prior_box_data = prior_box->data(); const T* prior_box_var_data = nullptr; - if (prior_box_var) prior_box_var_data = prior_box_var->data(); + if (var_size == 2) prior_box_var_data = prior_box_var->data(); int prior_box_offset = 0; + T var_data[4] = {1., 1., 1., 1.}; + T* var_ptr = var_data; #ifdef PADDLE_WITH_MKLML #pragma omp parallel for collapse(2) #endif for (int64_t i = 0; i < row; ++i) { for (int64_t j = 0; j < col; ++j) { size_t offset = i * col * len + j * len; - if (axis == 0) { - prior_box_offset = j * len; - } else if (axis == 1) { - prior_box_offset = i * len; - } + prior_box_offset = axis == 0 ? j * len : i * len; T prior_box_width = prior_box_data[prior_box_offset + 2] - prior_box_data[prior_box_offset] + (normalized == false); @@ -133,26 +129,18 @@ class BoxCoderKernel : public framework::OpKernel { T target_box_center_x = 0, target_box_center_y = 0; T target_box_width = 0, target_box_height = 0; - T box_var_x = T(1), box_var_y = T(1); - T box_var_w = T(1), box_var_h = T(1); - if (prior_box_var) { - int prior_var_offset = 0; - if (prior_box_var->dims().size() == 2) { - if (axis == 0) - prior_var_offset = j * len; - else if (axis == 1) - prior_var_offset = i * len; - } - box_var_x = prior_box_var_data[prior_var_offset]; - box_var_y = prior_box_var_data[prior_var_offset + 1]; - box_var_w = prior_box_var_data[prior_var_offset + 2]; - box_var_h = prior_box_var_data[prior_var_offset + 3]; - } else if (!(variance.empty())) { - box_var_x = static_cast(variance[0]); - box_var_y = static_cast(variance[1]); - box_var_w = static_cast(variance[2]); - box_var_h = static_cast(variance[3]); + int prior_var_offset = axis == 0 ? 
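// The <axis, var_size> template parameters hoist the per-element branches of
// DecodeCenterSize to compile time; the caller (further below) dispatches to
// one of six instantiations: axis in {0, 1} times var_size in {2: variance
// read from the PriorBoxVar tensor, 1: variance taken from the attribute
// vector, 0: no variance, i.e. all ones}.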
j * len : i * len; + if (var_size == 2) { + std::memcpy(var_ptr, prior_box_var_data + prior_var_offset, + 4 * sizeof(T)); + } else if (var_size == 1) { + var_ptr = reinterpret_cast(variance.data()); } + T box_var_x = *var_ptr; + T box_var_y = *(var_ptr + 1); + T box_var_w = *(var_ptr + 2); + T box_var_h = *(var_ptr + 3); + target_box_center_x = box_var_x * target_box_data[offset] * prior_box_width + prior_box_center_x; @@ -211,8 +199,31 @@ class BoxCoderKernel : public framework::OpKernel { EncodeCenterSize(target_box, prior_box, prior_box_var, normalized, variance, output); } else if (code_type == BoxCodeType::kDecodeCenterSize) { - DecodeCenterSize(target_box, prior_box, prior_box_var, normalized, axis, - variance, output); + if (prior_box_var) { + if (axis == 0) { + DecodeCenterSize<0, 2>(target_box, prior_box, prior_box_var, + normalized, variance, output); + } else { + DecodeCenterSize<1, 2>(target_box, prior_box, prior_box_var, + normalized, variance, output); + } + } else if (!(variance.empty())) { + if (axis == 0) { + DecodeCenterSize<0, 1>(target_box, prior_box, prior_box_var, + normalized, variance, output); + } else { + DecodeCenterSize<1, 1>(target_box, prior_box, prior_box_var, + normalized, variance, output); + } + } else { + if (axis == 0) { + DecodeCenterSize<0, 0>(target_box, prior_box, prior_box_var, + normalized, variance, output); + } else { + DecodeCenterSize<1, 0>(target_box, prior_box, prior_box_var, + normalized, variance, output); + } + } } } }; diff --git a/paddle/fluid/operators/detection/density_prior_box_op.h b/paddle/fluid/operators/detection/density_prior_box_op.h index ed2f5df80cf4d7a5a44af9b09f3b048b1b14cdb9..3591681fc3f6951dfc8d73e8edce38180b771eaf 100644 --- a/paddle/fluid/operators/detection/density_prior_box_op.h +++ b/paddle/fluid/operators/detection/density_prior_box_op.h @@ -52,6 +52,10 @@ class DensityPriorBoxOpKernel : public framework::OpKernel { step_height = step_h; } int num_priors = 0; + +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for reduction(+ : num_priors) +#endif for (size_t i = 0; i < densities.size(); ++i) { num_priors += (fixed_ratios.size()) * (pow(densities[i], 2)); } @@ -64,6 +68,17 @@ class DensityPriorBoxOpKernel : public framework::OpKernel { auto e_boxes = framework::EigenTensor::From(*boxes).setConstant(0.0); int step_average = static_cast((step_width + step_height) * 0.5); + std::vector sqrt_fixed_ratios; +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int i = 0; i < fixed_ratios.size(); i++) { + sqrt_fixed_ratios.push_back(sqrt(fixed_ratios[i])); + } + +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for collapse(2) +#endif for (int h = 0; h < feature_height; ++h) { for (int w = 0; w < feature_width; ++w) { T center_x = (w + offset) * step_width; @@ -73,34 +88,25 @@ class DensityPriorBoxOpKernel : public framework::OpKernel { for (size_t s = 0; s < fixed_sizes.size(); ++s) { auto fixed_size = fixed_sizes[s]; int density = densities[s]; + int shift = step_average / density; // Generate density prior boxes with fixed ratios. for (size_t r = 0; r < fixed_ratios.size(); ++r) { - float ar = fixed_ratios[r]; - int shift = step_average / density; - float box_width_ratio = fixed_size * sqrt(ar); - float box_height_ratio = fixed_size / sqrt(ar); + float box_width_ratio = fixed_size * sqrt_fixed_ratios[r]; + float box_height_ratio = fixed_size / sqrt_fixed_ratios[r]; + float density_center_x = center_x - step_average / 2. + shift / 2.; + float density_center_y = center_y - step_average / 2. 
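// Hoisting sqrt(fixed_ratios[r]) and the density center offsets out of the
// di/dj loops removes redundant work: e.g. (values assumed) with
// step_average = 32 and density = 4, shift = 8, density_center_x starts at
// center_x - 16. + 4. = center_x - 12., and each inner iteration only adds
// dj * shift instead of recomputing the full expression.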
+ shift / 2.; for (int di = 0; di < density; ++di) { for (int dj = 0; dj < density; ++dj) { - float center_x_temp = - center_x - step_average / 2. + shift / 2. + dj * shift; - float center_y_temp = - center_y - step_average / 2. + shift / 2. + di * shift; - e_boxes(h, w, idx, 0) = - (center_x_temp - box_width_ratio / 2.) / img_width >= 0 - ? (center_x_temp - box_width_ratio / 2.) / img_width - : 0; - e_boxes(h, w, idx, 1) = - (center_y_temp - box_height_ratio / 2.) / img_height >= 0 - ? (center_y_temp - box_height_ratio / 2.) / img_height - : 0; - e_boxes(h, w, idx, 2) = - (center_x_temp + box_width_ratio / 2.) / img_width <= 1 - ? (center_x_temp + box_width_ratio / 2.) / img_width - : 1; - e_boxes(h, w, idx, 3) = - (center_y_temp + box_height_ratio / 2.) / img_height <= 1 - ? (center_y_temp + box_height_ratio / 2.) / img_height - : 1; + float center_x_temp = density_center_x + dj * shift; + float center_y_temp = density_center_y + di * shift; + e_boxes(h, w, idx, 0) = std::max( + (center_x_temp - box_width_ratio / 2.) / img_width, 0.); + e_boxes(h, w, idx, 1) = std::max( + (center_y_temp - box_height_ratio / 2.) / img_height, 0.); + e_boxes(h, w, idx, 2) = std::min( + (center_x_temp + box_width_ratio / 2.) / img_width, 1.); + e_boxes(h, w, idx, 3) = std::min( + (center_y_temp + box_height_ratio / 2.) / img_height, 1.); idx++; } } @@ -131,8 +137,14 @@ class DensityPriorBoxOpKernel : public framework::OpKernel { vars->Resize({box_num, static_cast(variances.size())}); auto e_vars = framework::EigenMatrix::From(*vars); - - e_vars = var_et.broadcast(Eigen::DSizes(box_num, 1)); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for collapse(2) +#endif + for (int i = 0; i < box_num; ++i) { + for (int j = 0; j < variances.size(); ++j) { + e_vars(i, j) = variances[j]; + } + } vars->Resize(var_dim); boxes->Resize(box_dim); diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc index 7e789cd8d9143164c2346b067855eb904e00075f..c6c658236c235f0a6767924026b0a7610071e918 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc @@ -18,6 +18,7 @@ namespace ops = paddle::operators; REGISTER_ELEMWISE_GRAD_MAKER(elementwise_add, Add); REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_add, "Add", "Out = X + Y", "Out", "X"); + REGISTER_OP_CPU_KERNEL( elementwise_add, ops::ElementwiseAddKernel, diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index fd2a98cb45f446e80a4be1b50e94ee611cd23e62..d04bb8f338a80946e8f1d945f66122f02f526eac 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -250,6 +250,20 @@ class ElemwiseGradKernel : public framework::OpKernel { } }; +class ElementwiseOpInplace : public framework::InplaceInToOut { + public: + using framework::InplaceInToOut::InplaceInToOut; + + protected: + std::unordered_map Apply( + const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + return std::unordered_map{ + {"X", "Out"}, + }; + } +}; + } // namespace operators } // namespace paddle @@ -299,6 +313,7 @@ class ElemwiseGradKernel : public framework::OpKernel { REGISTER_OPERATOR(op_type, ::paddle::operators::ElementwiseOp, \ __ElemwiseOp##op_type##Maker__, \ ::paddle::operators::ElementwiseOpInferVarType, \ - op_type##GradMaker); \ + op_type##GradMaker, \ + ::paddle::operators::ElementwiseOpInplace); \ 
REGISTER_OPERATOR(op_type##_grad, \ ::paddle::operators::ElementwiseOpExplicitGrad) diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc index 8e80dc0e641c443923076c31e269689b5bc134a7..bb904166c4a19997a57723d9f2e50cc839aae960 100644 --- a/paddle/fluid/operators/flatten_op.cc +++ b/paddle/fluid/operators/flatten_op.cc @@ -267,6 +267,35 @@ class Flatten2GradOp : public framework::OperatorBase { } }; +class FlattenOpInplaceInToOut : public framework::InplaceInToOut { + public: + using InplaceInToOut::InplaceInToOut; + + protected: + std::unordered_map<std::string, std::string> Apply( + const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + std::unordered_map<std::string, std::string> inplace_in_to_out = { + {"X", "Out"}, + }; + return inplace_in_to_out; + } +}; + +class FlattenGradInplaceInToOut : public framework::InplaceInToOut { + public: + using InplaceInToOut::InplaceInToOut; + + protected: + std::unordered_map<std::string, std::string> Apply( + const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + std::unordered_map<std::string, std::string> inplace_in_to_out = { + {framework::GradVarName("Out"), framework::GradVarName("X")}, + }; + return inplace_in_to_out; + } +}; + } // namespace operators } // namespace paddle @@ -275,10 +304,13 @@ USE_OP(reshape); namespace ops = paddle::operators; REGISTER_OPERATOR(flatten, ops::FlattenOp, ops::FlattenOpMaker, ops::FlattenOpInferShape, - paddle::framework::DefaultGradOpDescMaker<true>); -REGISTER_OPERATOR(flatten_grad, ops::FlattenGradOp, ops::FlattenGradInferShape); + paddle::framework::DefaultGradOpDescMaker<true>, + ops::FlattenOpInplaceInToOut); +REGISTER_OPERATOR(flatten_grad, ops::FlattenGradOp, ops::FlattenGradInferShape, + ops::FlattenGradInplaceInToOut); REGISTER_OPERATOR(flatten2, ops::Flatten2Op, ops::Flatten2OpMaker, - ops::Flatten2OpInferShape, ops::Flatten2GradOpMaker); + ops::Flatten2OpInferShape, ops::Flatten2GradOpMaker, + ops::FlattenOpInplaceInToOut); REGISTER_OPERATOR(flatten2_grad, ops::Flatten2GradOp, - ops::Flatten2GradInferShape); + ops::Flatten2GradInferShape, ops::FlattenGradInplaceInToOut); diff --git a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc index a35ee8a09ed5ddcc4ac465d200b84358fa65b2f3..e9e2a3b1f5c1c00bb2e95b6171ecd09bfe7a0d21 100644 --- a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc +++ b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc @@ -79,17 +79,17 @@ void FusionRepeatedFCReluOpMaker::Make() { } template <typename T> -static void fc_relu(const T* x, const T* w, const T* b, T* y, int m, int n, - int k) { +static void fc_relu(const T* x, const T* w, const T* b, T* y, + const jit::matmul_attr_t& attr) { auto matmul = - jit::Get<jit::kMatMul, jit::MatMulTuples<T>, platform::CPUPlace>(k); + jit::Get<jit::kMatMul, jit::MatMulTuples<T>, platform::CPUPlace>(attr); auto addbias_relu = - jit::Get<jit::kVAddRelu, jit::XYZNTuples<T>, platform::CPUPlace>(n); - matmul(x, w, y, m, n, k); + jit::Get<jit::kVAddRelu, jit::XYZNTuples<T>, platform::CPUPlace>(attr.n); + matmul(x, w, y, &attr); T* dst = y; - for (int i = 0; i < m; ++i) { - addbias_relu(b, dst, dst, n); - dst += n; + for (int i = 0; i < attr.m; ++i) { + addbias_relu(b, dst, dst, attr.n); + dst += attr.n; } } @@ -107,32 +107,33 @@ class FusionRepeatedFCReluKernel : public framework::OpKernel<T> { auto i_dims = in->dims(); auto w_dims = weights[0]->dims(); - int m = i_dims[0]; - int n = w_dims[1]; - int k = w_dims[0]; - relus[0]->Resize({m, n}); + jit::matmul_attr_t attr; + attr.m = i_dims[0]; + attr.n = w_dims[1]; + attr.k = w_dims[0]; + relus[0]->Resize({attr.m, attr.n}); fc_relu(in->data<T>(), weights[0]->data<T>(), biases[0]->data<T>(), -
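// Usage sketch of the attr-based fc_relu above (shapes assumed): for an
// input of shape [bs, k] and a weight of shape [k, n],
//
//   jit::matmul_attr_t attr;
//   attr.m = bs;
//   attr.n = n;
//   attr.k = k;
//   fc_relu(x_data, w_data, b_data, y_data, attr);  // y = relu(x * w + b)
//
// so one matmul_attr_t now carries what used to be three loose ints.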
relus[0]->mutable_data(place), m, n, k); + relus[0]->mutable_data(place), attr); for (int i = 1; i < weight_sz - 1; ++i) { auto i_dims = relus[i - 1]->dims(); auto w_dims = weights[i]->dims(); - int m = i_dims[0]; - int n = w_dims[1]; - int k = w_dims[0]; - relus[i]->Resize({m, n}); + attr.m = i_dims[0]; + attr.n = w_dims[1]; + attr.k = w_dims[0]; + relus[i]->Resize({attr.m, attr.n}); fc_relu(relus[i - 1]->data(), weights[i]->data(), - biases[i]->data(), relus[i]->mutable_data(place), m, n, k); + biases[i]->data(), relus[i]->mutable_data(place), attr); } auto i_dims_last = relus[weight_sz - 2]->dims(); auto w_dims_last = weights[weight_sz - 1]->dims(); - m = i_dims_last[0]; - n = w_dims_last[1]; - k = w_dims_last[0]; + attr.m = i_dims_last[0]; + attr.n = w_dims_last[1]; + attr.k = w_dims_last[0]; fc_relu(relus[weight_sz - 2]->data(), weights[weight_sz - 1]->data(), - biases[weight_sz - 1]->data(), out->mutable_data(place), m, n, - k); + biases[weight_sz - 1]->data(), out->mutable_data(place), + attr); } }; diff --git a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc index 00dafdead53bbd4614c70875441c565724fca46d..8c8b079633aacb711aa304ec7016c37c6bec61ce 100644 --- a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc +++ b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc @@ -87,15 +87,18 @@ class FusionSquaredMatSubKernel : public framework::OpKernel { auto x_dims = x->dims(); auto y_dims = y->dims(); - int m = x_dims[0]; - int k = x_dims[1]; - int n = y_dims[1]; - int o_numel = m * n; + jit::matmul_attr_t attr; + attr.m = x_dims[0]; + attr.k = x_dims[1]; + attr.n = y_dims[1]; + int o_numel = attr.m * attr.n; auto vsquare_x = - jit::Get, platform::CPUPlace>(m * k); + jit::Get, platform::CPUPlace>(attr.m * + attr.k); auto vsquare_y = - jit::Get, platform::CPUPlace>(k * n); + jit::Get, platform::CPUPlace>(attr.k * + attr.n); auto vsquare_xy = jit::Get, platform::CPUPlace>(o_numel); auto vsub = @@ -103,7 +106,7 @@ class FusionSquaredMatSubKernel : public framework::OpKernel { auto vscal = jit::Get, platform::CPUPlace>(o_numel); auto matmul = - jit::Get, platform::CPUPlace>(k); + jit::Get, platform::CPUPlace>(attr); const T* x_data = x->data(); const T* y_data = y->data(); @@ -112,12 +115,12 @@ class FusionSquaredMatSubKernel : public framework::OpKernel { T* squared_xy_data = squared_xy->mutable_data(place); T* o_data = out->mutable_data(place); - matmul(x_data, y_data, squared_xy_data, m, n, k); + matmul(x_data, y_data, squared_xy_data, &attr); vsquare_xy(squared_xy_data, squared_xy_data, o_numel); - vsquare_x(x_data, squared_x_data, m * k); - vsquare_y(y_data, squared_y_data, k * n); - matmul(squared_x_data, squared_y_data, o_data, m, n, k); + vsquare_x(x_data, squared_x_data, attr.m * attr.k); + vsquare_y(y_data, squared_y_data, attr.k * attr.n); + matmul(squared_x_data, squared_y_data, o_data, &attr); vsub(squared_xy_data, o_data, o_data, o_numel); vscal(&scalar, o_data, o_data, o_numel); diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 5c5a61f64093802697eb21452267471129c7fcf3..97ddf223aefcdfaf8a488f93a152336c1ed458f4 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -93,6 +93,7 @@ std::vector TestSizes() { template struct BenchFunc { // return this function avg time + // TODO(TJ): clear cache every time double operator()(const typename KernelTuples::func_type tgt, Args... 
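// For reference, the fused kernel above evaluates
//   out = scalar * ((x * y)^2 - (x^2) * (y^2))
// with x in R^{m x k} and y in R^{k x n}: matmul then vsquare_xy form
// (x * y)^2 elementwise, vsquare_x/vsquare_y square the inputs, a second
// matmul forms (x^2) * (y^2), and vsub/vscal finish the o_numel = m * n
// output elements.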
args) { for (int i = 0; i < FLAGS_burning; ++i) { tgt(args...); @@ -172,6 +173,9 @@ void BenchXYZNKernel() { RandomVec(d, y_data); BenchAllImpls, PlaceType>(d, x.data(), y.data(), z_data, d); + // test inplace + BenchAllImpls, PlaceType>(d, x.data(), z_data, + z_data, d); } } @@ -311,8 +315,9 @@ void BenchMatMulKernel() { const T* a_data = a.data(); const T* b_data = b.data(); T* c_data = c.mutable_data(PlaceType()); - BenchAllImpls, PlaceType>(k, a_data, b_data, - c_data, m, n, k); + const jit::matmul_attr_t attr{m, n, k}; + BenchAllImpls, PlaceType>(attr, a_data, b_data, + c_data, &attr); } } } diff --git a/paddle/fluid/operators/jit/gen/CMakeLists.txt b/paddle/fluid/operators/jit/gen/CMakeLists.txt index 2ea8f927e1a13867fa2065841fac05e766735237..efc7eb79d36c5cf9fac4ac40db4e2e28cb242e22 100644 --- a/paddle/fluid/operators/jit/gen/CMakeLists.txt +++ b/paddle/fluid/operators/jit/gen/CMakeLists.txt @@ -9,6 +9,7 @@ function(USE_JITKERNEL_GEN TARGET) endfunction() # use gen jitcode kernel by name +USE_JITKERNEL_GEN(kMatMul) USE_JITKERNEL_GEN(kVMul) USE_JITKERNEL_GEN(kVAdd) USE_JITKERNEL_GEN(kVSub) diff --git a/paddle/fluid/operators/jit/gen/blas.cc b/paddle/fluid/operators/jit/gen/blas.cc index dee6c7b9d3ee9756c1b11d10d55fdca341cbee85..5da24c359edd2df93333fe0ca8a18cdc7385aadb 100644 --- a/paddle/fluid/operators/jit/gen/blas.cc +++ b/paddle/fluid/operators/jit/gen/blas.cc @@ -155,7 +155,7 @@ class NCHW16CMulNCCreator : public JitCodeCreator { class name##Creator : public JitCodeCreator { \ public: \ bool UseMe(const int& attr) const override { \ - return platform::MayIUse(platform::avx); \ + return platform::MayIUse(platform::avx) && attr <= 1024; \ } \ size_t CodeSize(const int& d) const override { \ return 96 + d / YMM_FLOAT_BLOCK * 4 * 8; \ diff --git a/paddle/fluid/operators/jit/gen/blas.h b/paddle/fluid/operators/jit/gen/blas.h index de6b33f467279124d7acd97709516c31706ec4f9..66a97c1be503b0fa983f9a7ec3b61c986774f16b 100644 --- a/paddle/fluid/operators/jit/gen/blas.h +++ b/paddle/fluid/operators/jit/gen/blas.h @@ -61,6 +61,7 @@ class VXXJitCode : public JitCode { base += "_Vec"; } base += (with_relu_ ? "_Relu" : ""); + base += "_D" + std::to_string(num_); return base.c_str(); } void genCode() override; diff --git a/paddle/fluid/operators/jit/gen/matmul.cc b/paddle/fluid/operators/jit/gen/matmul.cc new file mode 100644 index 0000000000000000000000000000000000000000..ae3858eab20aeb80553d8fcec4088a6632c9c17d --- /dev/null +++ b/paddle/fluid/operators/jit/gen/matmul.cc @@ -0,0 +1,128 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ + +#include "paddle/fluid/operators/jit/gen/matmul.h" +#include // offsetof +#include + +#include "paddle/fluid/operators/jit/registry.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +void MatMulJitCode::genCode() { + preCode(); + int block, rest; + const auto groups = packed_groups(n_, k_, &block, &rest); + PADDLE_ENFORCE_GT(groups.front(), 0); + + const int block_len = sizeof(float) * block; + const int x_reg_idx = (block == ZMM_FLOAT_BLOCK ? 32 : 16) - 1; + const int w_reg_idx = x_reg_idx - 1; + // from packed mov(reg_ptr_wgt, ptr[param_attr + offsetof(matmul_attr_t, + // packed_weight)]); + mov(reg_ptr_wgt, param_y); + size_t z_offset = 0; + size_t wgt_offset = 0; + for (size_t g = 0; g < groups.size(); ++g) { + size_t x_offset = 0; + for (int k = 0; k < k_; ++k) { + vbroadcastss(zmm_t(x_reg_idx), ptr[param_x + x_offset]); + // clean + if (k == 0) { + for (int i = 0; i < groups[g]; ++i) { + vxorps(zmm_t(i), zmm_t(i), zmm_t(i)); + } + } + for (int i = 0; i < groups[g]; ++i) { + vmovups(zmm_t(w_reg_idx), ptr[reg_ptr_wgt + wgt_offset]); + vfmadd231ps(zmm_t(i), zmm_t(w_reg_idx), zmm_t(x_reg_idx)); + wgt_offset += block_len; + } + // last one, save + if (k == k_ - 1) { + for (int i = 0; i < groups[g]; ++i) { + // only rest save should be careful + if (rest != 0 && g == groups.size() - 1 && i == groups[g] - 1) { + break; + } + vmovups(ptr[param_z + z_offset + i * block_len], zmm_t(i)); + } + } + x_offset += sizeof(float); + } + z_offset += block_len * groups[g]; + } + + if (rest != 0) { + // below should refine with mask + int reg_idx = groups.back() - 1; + z_offset = (n_ - rest) * sizeof(float); + int inner_block = 8; + while (rest > 0) { + if (rest >= 8) { + inner_block = 8; + vmovups(ptr[param_z + z_offset], ymm_t(reg_idx)); + // shift zmm of inner_block, change reg_idx if update + } else if (rest >= 4) { + inner_block = 4; + vmovups(ptr[param_z + z_offset], xmm_t(reg_idx)); + } else if (rest >= 2) { + inner_block = 2; + vmovq(ptr[param_z + z_offset], xmm_t(reg_idx)); + } else { + inner_block = 1; + vmovss(ptr[param_z + z_offset], xmm_t(reg_idx)); + } + z_offset += inner_block * sizeof(float); + rest -= inner_block; + } + } + + postCode(); +} + +class MatMulCreator : public JitCodeCreator { + public: + bool UseMe(const matmul_attr_t& attr) const override { + return attr.m == 1 && platform::MayIUse(platform::avx512f) && + attr.n % ZMM_FLOAT_BLOCK == 0 && attr.k < 512; + } + size_t CodeSize(const matmul_attr_t& attr) const override { + int block = YMM_FLOAT_BLOCK; + if (platform::MayIUse(platform::avx512f)) { + block = ZMM_FLOAT_BLOCK; + } + return 96 + 4 * attr.k * (attr.n / block + 1) * 8; + } + std::unique_ptr CreateJitCode( + const matmul_attr_t& attr) const override { + PADDLE_ENFORCE_GT(attr.m, 0); + PADDLE_ENFORCE_GT(attr.n, 0); + PADDLE_ENFORCE_GT(attr.k, 0); + return make_unique(attr, CodeSize(attr)); + } +}; + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle + +namespace gen = paddle::operators::jit::gen; + +REGISTER_JITKERNEL_GEN(kMatMul, gen::MatMulCreator); diff --git a/paddle/fluid/operators/jit/gen/matmul.h b/paddle/fluid/operators/jit/gen/matmul.h new file mode 100644 index 0000000000000000000000000000000000000000..626baa8f738bf0395f3c7f1700610d0a9075879b --- /dev/null +++ b/paddle/fluid/operators/jit/gen/matmul.h @@ -0,0 +1,62 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include <stdlib.h>  // for malloc and free +#include <string> +#include <vector> +#include "glog/logging.h" +#include "paddle/fluid/operators/jit/gen/jitcode.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +class MatMulJitCode : public JitCode { + public: + explicit MatMulJitCode(const matmul_attr_t& attr, + size_t code_size = 256 * 1024, + void* code_ptr = nullptr) + : JitCode(code_size, code_ptr), m_(attr.m), n_(attr.n), k_(attr.k) { + PADDLE_ENFORCE_EQ(m_, 1, "Only support m==1 yet"); + this->genCode(); + } + + virtual const char* name() const { + std::string base = "MatMulJitCode"; + base = base + "_M" + std::to_string(m_) + "_N" + std::to_string(n_) + "_K" + + std::to_string(k_); + return base.c_str(); + } + void genCode() override; + + private: + int m_, n_, k_; + + reg64_t param_x{abi_param1}; + reg64_t param_y{abi_param2}; + reg64_t param_z{abi_param3}; + reg64_t param_attr{abi_param4}; + reg64_t reg_tmp{rax}; + + reg64_t reg_ptr_wgt{r10}; +}; + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jit/gen_base.cc b/paddle/fluid/operators/jit/gen_base.cc index 310da0c76f1ab251d788e54f2305f375f3fb4838..3cd5f6554bdc188ce9ea0c0b85c84d032c509600 100644 --- a/paddle/fluid/operators/jit/gen_base.cc +++ b/paddle/fluid/operators/jit/gen_base.cc @@ -16,6 +16,8 @@ #include <fstream> #include <iostream> #include <sstream> +#include <vector> +#include "paddle/fluid/platform/cpu_info.h" DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file"); @@ -38,6 +40,35 @@ void GenBase::dumpCode(const unsigned char* code) const { } } +std::vector<int> packed_groups(int n, int k, int* block_out, int* rest_out) { + int block; + int max_num_regs; + if (platform::MayIUse(platform::avx512f)) { + block = ZMM_FLOAT_BLOCK; + max_num_regs = 32; + } else { + block = YMM_FLOAT_BLOCK; + max_num_regs = 16; + } + // one for x, one for y, others for z + const int max_used_regs_for_n = max_num_regs - 2; + const int aligned_n = n % block == 0 ?
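// Worked example for packed_groups (values assumed): on an avx512f machine,
// block = ZMM_FLOAT_BLOCK = 16 and max_num_regs = 32, leaving 30 registers
// for z. For n = 38: aligned_n = 48, num_block = 3, so groups = {3} with
// rest = 38 % 16 = 6 tail floats that the generated code stores separately.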
n : (n / block + 1) * block; + const int num_block = aligned_n / block; + const int num_groups = num_block / max_used_regs_for_n; + std::vector<int> groups(num_groups, max_used_regs_for_n); + int rest_num_regs = num_block % max_used_regs_for_n; + if (rest_num_regs != 0) { + groups.push_back(rest_num_regs); + } + if (block_out) { + *block_out = block; + } + if (rest_out) { + *rest_out = n % block; + } + return groups; +} + } // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jit/gen_base.h b/paddle/fluid/operators/jit/gen_base.h index 4af01a437670aa6a07d370ff23ed2abd369f69a3..d808a332472ae86240cb63356cb417123523366a 100644 --- a/paddle/fluid/operators/jit/gen_base.h +++ b/paddle/fluid/operators/jit/gen_base.h @@ -16,6 +16,7 @@ #include <gflags/gflags.h> #include <memory>  // for unique_ptr +#include <vector> #include "paddle/fluid/operators/jit/kernel_base.h" DECLARE_bool(dump_jitcode); @@ -67,6 +68,11 @@ class JitCodeCreator : public GenCreator { virtual std::unique_ptr<GenBase> CreateJitCode(const Attr& attr) const = 0; }; +// unified helper for computing packed groups: +// outputs the groups used when packing weights, together with the block size +// and the tail (rest) size +std::vector<int> packed_groups(int n, int k, int* block = nullptr, + int* rest = nullptr); + } // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc index 4dac2f2460f72c7da63f48c82549b948cc253153..e7292fe2bd8031aa5bbff68e7c2305a238085bf1 100644 --- a/paddle/fluid/operators/jit/helper.cc +++ b/paddle/fluid/operators/jit/helper.cc @@ -14,6 +14,8 @@ #include "paddle/fluid/operators/jit/helper.h" #include <algorithm>  // tolower +#include <cstring> +#include <numeric> #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -91,6 +93,41 @@ KernelType to_kerneltype(const std::string& act) { return kNone; } +template <> +void pack_weights<float>(const float* src, float* dst, int n, int k) { + int block, rest; + const auto groups = packed_groups(n, k, &block, &rest); + std::for_each(groups.begin(), groups.end(), [&](int i) { + PADDLE_ENFORCE_GT(i, 0, "each element of groups should be larger than 0."); + }); + int sum = std::accumulate(groups.begin(), groups.end(), 0); + std::memset(dst, 0, k * sum * block * sizeof(float)); + PADDLE_ENFORCE_GE(sum * block, n, + "The packed n should be equal to or larger than n"); + + const int block_len = sizeof(float) * block; + int n_offset = 0; + + for (size_t g = 0; g < groups.size(); ++g) { + const float* from = src + n_offset; + for (int j = 0; j < k; ++j) { + size_t copy_sz = groups[g] * block_len; + if (g == groups.size() - 1 && rest != 0) { + copy_sz = (groups[g] - 1) * block_len + rest * sizeof(float); + } + std::memcpy(dst, from + j * n, copy_sz); + dst += groups[g] * block; + } + n_offset += groups[g] * block; + } +} + +template <typename T> +typename std::enable_if<!std::is_same<T, float>::value>::type pack_weights( + const T* src, T* dst, int n, int k) { + PADDLE_THROW("Only support pack with float type."); +} + } // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index 7bdc45779b7d39d36db0d52ca9361943cdcdef3e..d5773d65940127ea0a9b77ed2760bd371b778f4c 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -118,26 +118,33 @@ typename KernelTuples::func_type Get( return GetRefer<KT, KernelTuples>(); } -template <KernelType KT, typename KernelTuples> -class KernelFuncsCache { +template <KernelType KT, typename KernelTuples, typename PlaceType> +class KernelFuncs { public: - KernelFuncsCache() = default; - static KernelFuncsCache& Instance() { - static thread_local
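// pack_weights lays the [k, n] weight out so that each group's blocks are
// contiguous per k step; e.g. (values assumed) n = 20, k = 2, block = 16
// gives groups = {2} and rest = 4, so dst is zeroed to k * 2 * 16 floats and
// each of the k rows contributes one full 16-float block plus a zero-padded
// 4-float tail.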
KernelFuncsCache g_func_cache; + KernelFuncs() = default; + static KernelFuncs& Cache() { + static thread_local KernelFuncs g_func_cache; return g_func_cache; } bool Has(int key) const { return funcs_.find(key) != funcs_.end(); } - typename KernelTuples::func_type At(int key) { return funcs_.at(key); } - void Insert(int key, typename KernelTuples::func_type func) { funcs_.emplace(key, func); } + typename KernelTuples::func_type At(int key) { + if (Has(key)) { + return funcs_.at(key); + } + auto func = Get<KT, KernelTuples, PlaceType>(key); + Insert(key, func); + return func; + } + private: std::unordered_map<int, typename KernelTuples::func_type> funcs_; - DISABLE_COPY_AND_ASSIGN(KernelFuncsCache); + DISABLE_COPY_AND_ASSIGN(KernelFuncs); }; const char* to_string(KernelType kt); @@ -152,17 +159,28 @@ inline std::ostream& operator<<(std::ostream& os, const lstm_attr_t& attr) { << (attr.use_peephole ? "True" : "False") << "]"; return os; } + inline std::ostream& operator<<(std::ostream& os, const gru_attr_t& attr) { os << "dim_size[" << attr.d << "],act_gate[" << to_string(attr.act_gate) << "],act_cand[" << to_string(attr.act_cand) << "]"; return os; } + inline std::ostream& operator<<(std::ostream& os, const seq_pool_attr_t& attr) { os << "height_size[" << attr.h << "],width_size[" << attr.w << "],pool_type[" << to_string(attr.type) << "]"; return os; } +inline std::ostream& operator<<(std::ostream& os, const matmul_attr_t& attr) { + os << "M[" << attr.m << "],N[" << attr.n << "],K[" << attr.k << "]"; + return os; +} + +// expose the method to pack matmul weights +template <typename T> +void pack_weights(const T* src, T* dst, int n, int k); + } // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index 42a58580f7b1e0832af57398ba9c29882b6cc6fb..4a8f61146a1921fa1d5f6b7e15af40cd45d31a22 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -145,11 +145,19 @@ struct SeqPoolTuples { typedef void (*func_type)(const T*, T*, const seq_pool_attr_t*); }; +typedef struct matmul_attr_s { + int m, n, k; + void* packed_weight{nullptr}; + matmul_attr_s() = default; + explicit matmul_attr_s(int m_, int n_, int k_, void* packed_weight_ = nullptr) + : m(m_), n(n_), k(k_), packed_weight(packed_weight_) {} +} matmul_attr_t; + template <typename T> struct MatMulTuples { typedef T data_type; - typedef int attr_type; - typedef void (*func_type)(const T*, const T*, T*, int, int, int); + typedef matmul_attr_t attr_type; + typedef void (*func_type)(const T*, const T*, T*, const matmul_attr_t*); }; template <typename T> diff --git a/paddle/fluid/operators/jit/kernel_key.cc b/paddle/fluid/operators/jit/kernel_key.cc index 61de38688664f83775c0c4e5aa6f7e06c3602ddb..1e4a8884e78c5d3c1748988f05ecf461a6f0eb94 100644 --- a/paddle/fluid/operators/jit/kernel_key.cc +++ b/paddle/fluid/operators/jit/kernel_key.cc @@ -49,6 +49,13 @@ size_t JitCodeKey<seq_pool_attr_t>(const seq_pool_attr_t& attr) { return (key << pool_type_shift) + static_cast<int>(attr.type); } +template <> +size_t JitCodeKey<matmul_attr_t>(const matmul_attr_t& attr) { + size_t key = attr.m; + constexpr int shift = 21; + return (key << shift * 2) + ((static_cast<size_t>(attr.n)) << shift) + attr.k; +} + } // namespace jit } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/jit/more/mix/mix.cc b/paddle/fluid/operators/jit/more/mix/mix.cc index 0f42ac158ca7926981df55936cb903d5f4ae4806..0036d1c238b17768c4df61af22a85588990e1815 100644 --- a/paddle/fluid/operators/jit/more/mix/mix.cc +++ b/paddle/fluid/operators/jit/more/mix/mix.cc
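// With the get-or-create At() above, call sites collapse to one line, e.g.
// (a sketch using kernel names that appear elsewhere in this patch):
//
//   auto vexp =
//       jit::KernelFuncs<jit::kVExp, jit::XYNTuples<float>,
//                        platform::CPUPlace>::Cache().At(n);
//
// The thread_local singleton keeps the cache lock-free at the cost of one
// cache instance per thread.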
@@ -49,49 +49,16 @@ void VTanh(const T* x, T* y, int n) { } void Softmax(const T* x, T* y, int n, int bs) { - typename XRNTuples::func_type compute_hmax{nullptr}; - typename XRNTuples::func_type compute_hsum{nullptr}; - typename AXYNTuples::func_type compute_vscal{nullptr}; - typename AXYNTuples::func_type compute_vaddbias{nullptr}; - typename XYNTuples::func_type compute_vexp{nullptr}; - - if (!KernelFuncsCache>::Instance().Has(n)) { - compute_hmax = Get, platform::CPUPlace>(n); - KernelFuncsCache>::Instance().Insert(n, compute_hmax); - } else { - compute_hmax = KernelFuncsCache>::Instance().At(n); - } - - if (!KernelFuncsCache>::Instance().Has(n)) { - compute_hsum = Get, platform::CPUPlace>(n); - KernelFuncsCache>::Instance().Insert(n, compute_hsum); - } else { - compute_hsum = KernelFuncsCache>::Instance().At(n); - } - - if (!KernelFuncsCache>::Instance().Has(n)) { - compute_vscal = Get, platform::CPUPlace>(n); - KernelFuncsCache>::Instance().Insert(n, - compute_vscal); - } else { - compute_vscal = KernelFuncsCache>::Instance().At(n); - } - - if (!KernelFuncsCache>::Instance().Has(n)) { - compute_vaddbias = Get, platform::CPUPlace>(n); - KernelFuncsCache>::Instance().Insert( - n, compute_vaddbias); - } else { - compute_vaddbias = - KernelFuncsCache>::Instance().At(n); - } - - if (!KernelFuncsCache>::Instance().Has(n)) { - compute_vexp = Get, platform::CPUPlace>(n); - KernelFuncsCache>::Instance().Insert(n, compute_vexp); - } else { - compute_vexp = KernelFuncsCache>::Instance().At(n); - } + auto compute_hmax = + KernelFuncs, platform::CPUPlace>::Cache().At(n); + auto compute_hsum = + KernelFuncs, platform::CPUPlace>::Cache().At(n); + auto compute_vscal = + KernelFuncs, platform::CPUPlace>::Cache().At(n); + auto compute_vaddbias = + KernelFuncs, platform::CPUPlace>::Cache().At(n); + auto compute_vexp = + KernelFuncs, platform::CPUPlace>::Cache().At(n); for (int i = 0; i < bs; ++i) { T scalar; diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc index 28a37198dae19a57509934ec784746bc23436e7a..4c999131ab116ebe3484355158993558b02cc4b2 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.cc +++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc @@ -25,17 +25,19 @@ namespace more { namespace mkl { template <> -void MatMul(const float* a, const float* b, float* c, int m, int n, - int k) { - platform::dynload::cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, - n, k, 1.f, a, k, b, n, 0.f, c, n); +void MatMul(const float* a, const float* b, float* c, + const matmul_attr_t* attr) { + platform::dynload::cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + attr->m, attr->n, attr->k, 1.f, a, attr->k, b, + attr->n, 0.f, c, attr->n); } template <> -void MatMul(const double* a, const double* b, double* c, int m, int n, - int k) { - platform::dynload::cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, - n, k, 1.0, a, k, b, n, 0.0, c, n); +void MatMul(const double* a, const double* b, double* c, + const matmul_attr_t* attr) { + platform::dynload::cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + attr->m, attr->n, attr->k, 1.0, a, attr->k, b, + attr->n, 0.0, c, attr->n); } template <> @@ -127,11 +129,6 @@ void ASum(const double* x, double* res, int n) { } // TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512 -template <> -bool MatMulKernel::UseMe(const int& d) const { - return platform::MayIUse(platform::avx); -} - template <> bool VMulKernel::UseMe(const int& d) const { return platform::MayIUse(platform::avx512f) && d > 512; @@ -139,7 
+136,7 @@ bool VMulKernel<float>::UseMe(const int& d) const { template <> bool VAddKernel<float>::UseMe(const int& d) const { - return platform::MayIUse(platform::avx512f) && d > 512; + return platform::MayIUse(platform::avx) && d > 512; } template <> @@ -177,6 +174,16 @@ bool SeqPoolKernel<double>::UseMe(const seq_pool_attr_t& attr) const { return true; } +template <> +bool MatMulKernel<float>::UseMe(const matmul_attr_t& attr) const { + return platform::MayIUse(platform::avx); +} + +template <> +bool MatMulKernel<double>::UseMe(const matmul_attr_t& attr) const { + return true; +} + template <> bool SoftmaxKernel<float>::UseMe(const int& d) const { // tuned on avx2 @@ -189,7 +196,6 @@ bool SoftmaxKernel<float>::UseMe(const int& d) const { return true; \ } -AWALYS_USE_ME_WITH_DOUBLE(MatMul); AWALYS_USE_ME_WITH_DOUBLE(VMul); AWALYS_USE_ME_WITH_DOUBLE(VAdd); AWALYS_USE_ME_WITH_DOUBLE(VScal); diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h index 6b95b9c872dc12cccaef0b0737edd760447a47d0..8130b87326f1887f232022ab30fa7bf42b0723e7 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.h +++ b/paddle/fluid/operators/jit/more/mkl/mkl.h @@ -26,7 +26,7 @@ namespace more { namespace mkl { template <typename T> -void MatMul(const T* a, const T* b, T* c, int m, int n, int k); +void MatMul(const T* a, const T* b, T* c, const matmul_attr_t* attr); template <typename T> void VMul(const T* x, const T* y, T* z, int n); diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index 5a074db7e0e8ab49dc281e1809edef23e6a25c42..0c4a985f8e8ece0a6169478fa3a9b111f5a6f3b4 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -363,17 +363,19 @@ void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) { // A(M,K) * B(K,N) = C(M,N) template <typename T> -void MatMul(const T* A, const T* B, T* C, int M, int N, int K) { +void MatMul(const T* A, const T* B, T* C, const matmul_attr_t* attr) { + int M = attr->m; + int N = attr->n; + int K = attr->k; for (int m = 0; m < M; ++m) { const T* pa = A + m * K; T* pc = C + m * N; for (int n = 0; n < N; ++n) { const T* pb = B + n; - T sum = static_cast<T>(0); - for (int k = 0; k < K; ++k) { - sum += (pa[k] * pb[k * N]); + pc[n] = pa[0] * pb[0]; + for (int k = 1; k < K; ++k) { + pc[n] += pa[k] * pb[k * N]; } - *(pc + n) = sum; } } } diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index cc461552898fc68661ce548a520d65215d3572b4..237e588d35cc3b33658a830db34676967818aab6 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -22,7 +22,7 @@ #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/place.h" -static double acc = 1e-5; +DEFINE_double(acc, 1e-5, "Test accuracy threshold."); template <typename T> void RandomVec(const int n, T* a, const T lower = static_cast<T>(-20.f), @@ -39,7 +39,7 @@ template <typename T> void ExpectEQ(const T* target, const T* refer, int n) { if (std::is_floating_point<T>::value) { for (int i = 0; i < n; ++i) { - EXPECT_NEAR(target[i], refer[i], acc); + EXPECT_NEAR(target[i], refer[i], FLAGS_acc); } } else { for (int i = 0; i < n; ++i) { @@ -272,21 +272,23 @@ struct TestFuncWithRefer<jit::SeqPoolTuples<T>, std::vector<T>, std::vector<T>, template <typename T> struct TestFuncWithRefer<jit::MatMulTuples<T>, std::vector<T>, std::vector<T>, - std::vector<T>, int, int, int> { + std::vector<T>, + typename jit::MatMulTuples<T>::attr_type> { void operator()(const typename jit::MatMulTuples<T>::func_type tgt, const std::vector<T>& a, const std::vector<T>& b, - const std::vector<T>& cref, int m, int n, int k) { + const std::vector<T>& cref, + const
typename jit::MatMulTuples::attr_type& attr) { EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(a.size(), static_cast(m * k)); - EXPECT_EQ(b.size(), static_cast(k * n)); - EXPECT_EQ(cref.size(), static_cast(m * n)); + EXPECT_EQ(a.size(), static_cast(attr.m * attr.k)); + EXPECT_EQ(b.size(), static_cast(attr.k * attr.n)); + EXPECT_EQ(cref.size(), static_cast(attr.m * attr.n)); std::vector c(cref.size()); const T* a_data = a.data(); const T* b_data = b.data(); const T* cref_data = cref.data(); T* c_data = c.data(); - tgt(a_data, b_data, c_data, m, n, k); - ExpectEQ(c_data, cref_data, m * n); + tgt(a_data, b_data, c_data, &attr); + ExpectEQ(c_data, cref_data, attr.m * attr.n); } }; @@ -383,8 +385,8 @@ void TestAXYNKernel() { template void TestXRNKernel() { VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); - auto last_acc = acc; - acc = 1e-4; + auto last_acc = FLAGS_acc; + FLAGS_acc = 1e-4; for (int d : TestSizes()) { auto ref = jit::GetRefer>(); EXPECT_TRUE(ref != nullptr); @@ -395,7 +397,7 @@ void TestXRNKernel() { TestAllImpls, PlaceType, std::vector, T>(d, x, ref_res); } - acc = last_acc; + FLAGS_acc = last_acc; } template @@ -535,9 +537,10 @@ void TestSeqPoolKernel() { template void TestMatMulKernel() { VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); - auto last_acc = acc; - // TODO(intel): this should be acc issue of MKL - acc = 1e-3; + auto last_acc = FLAGS_acc; + // TODO(intel): fix MKL acc issue + // https://github.com/PaddlePaddle/Paddle/issues/15447 + FLAGS_acc = 1e-3; for (int m : {1, 2, 3, 4}) { for (int n : {1, 2, 3, 4}) { for (int k : TestSizes()) { @@ -549,13 +552,14 @@ void TestMatMulKernel() { const T* a_data = a.data(); const T* b_data = b.data(); T* c_data = c.data(); - ref(a_data, b_data, c_data, m, n, k); + const jit::matmul_attr_t attr{m, n, k}; + ref(a_data, b_data, c_data, &attr); TestAllImpls, PlaceType, std::vector, - std::vector, std::vector>(k, a, b, c, m, n, k); + std::vector, std::vector>(attr, a, b, c, attr); } } } - acc = last_acc; + FLAGS_acc = last_acc; } template diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 5c44d044c67f9456d8d69c8c5bf5c6e2b710b68d..d4837696241b8c4e3cca4f2afe872c6be559853c 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -37,7 +37,7 @@ math_library(concat_and_split) math_library(context_project DEPS im2col math_function) math_library(cross_entropy) math_library(cos_sim_functor) -math_library(depthwise_conv) +math_library(depthwise_conv DEPS cub) math_library(im2col) math_library(sample_prob) math_library(sampler) diff --git a/paddle/fluid/operators/math/fc_compute.h b/paddle/fluid/operators/math/fc_compute.h index cddd0a18db53a7ddf9ca14d5f373180586ef6a31..0ad57c51be79cd3577b43c9af777bff710308fac 100644 --- a/paddle/fluid/operators/math/fc_compute.h +++ b/paddle/fluid/operators/math/fc_compute.h @@ -30,15 +30,17 @@ inline void FCCompute(const BlasT& blas, const int M, return; } if (relu) { - auto compute = - jit::Get, platform::CPUPlace>(N); + auto compute = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(N); for (int i = 0; i < M; i++) { T* dst = Y + i * N; compute(B, dst, dst, N); } } else { - auto compute = - jit::Get, platform::CPUPlace>(N); + auto compute = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(N); #ifdef PADDLE_WITH_MKLML #pragma omp parallel for #endif diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index 
1ff9ff684fc8001afb0f768a033b4c5bd1592702..a1cb3f972826a67721b00ce6df0ec48cc34d6e03 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -82,8 +82,9 @@ class SoftmaxFunctor> { const int kClassDim = 1; // 2D data. Batch x C auto compute_softmax = - jit::Get, platform::CPUPlace>( - in_dims[kClassDim]); + jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(in_dims[kClassDim]); compute_softmax(in_data, out_data, in_dims[kClassDim], in_dims[kBatchDim]); } }; diff --git a/paddle/fluid/operators/ngraph/ngraph_bridge.cc b/paddle/fluid/operators/ngraph/ngraph_bridge.cc index 13b168ce4553c3377a62d9781b185fa7303c1136..38e65524e870834710ff29f722c69eadf67d9dbe 100644 --- a/paddle/fluid/operators/ngraph/ngraph_bridge.cc +++ b/paddle/fluid/operators/ngraph/ngraph_bridge.cc @@ -31,6 +31,9 @@ std::map>>)>> NgraphBridge::NG_NODE_MAP = { + {"accuracy", NG_OPS::BuildAccuracyNode}, + {"conv2d", NG_OPS::BuildConv2dNode}, + {"conv2d_grad", NG_OPS::BuildConv2dGradNode}, {"elementwise_add", NG_OPS::BuildElementwiseAddNode}, {"elementwise_add_grad", NG_OPS::BuildElementwiseAddGradNode}, {"fill_constant", NG_OPS::BuildFillConstantNode}, diff --git a/paddle/fluid/operators/ngraph/ngraph_ops.h b/paddle/fluid/operators/ngraph/ngraph_ops.h index 4b7aa3393b40054cb9e51152ea8fc4ac6548c008..fb574f1bc1160c79f5802f11c00716eccad7f48d 100644 --- a/paddle/fluid/operators/ngraph/ngraph_ops.h +++ b/paddle/fluid/operators/ngraph/ngraph_ops.h @@ -21,7 +21,9 @@ limitations under the License. */ #pragma once -#include "ops/binary_unnary_op.h" +#include "ops/accuracy_op.h" +#include "ops/binary_unary_op.h" +#include "ops/conv2d_op.h" #include "ops/elementwise_add_op.h" #include "ops/fill_constant_op.h" #include "ops/mean_op.h" diff --git a/paddle/fluid/operators/ngraph/ops/accuracy_op.h b/paddle/fluid/operators/ngraph/ops/accuracy_op.h new file mode 100644 index 0000000000000000000000000000000000000000..bf37ce48d8c2ce3b97fac154be9d1dfb08421f97 --- /dev/null +++ b/paddle/fluid/operators/ngraph/ops/accuracy_op.h @@ -0,0 +1,65 @@ +/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
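The fc_compute.h and softmax_impl.h hunks above share one pattern: the per-call jit::Get lookup becomes KernelFuncs<...>::Cache().At(key). A rough sketch of what a keyed kernel cache like that can look like — every name and signature below is hypothetical, not Paddle's actual API:

```
#include <functional>
#include <unordered_map>

// Hypothetical stand-in for jit::Get: resolve the best kernel for size n.
using KernelFn = std::function<void(const float*, float*, int)>;

KernelFn ResolveKernel(int n) {
  (void)n;
  // Placeholder body: a plain in-place vector add. The real lookup walks
  // the registered JIT / vendor / reference implementations.
  return [](const float* x, float* y, int len) {
    for (int i = 0; i < len; ++i) y[i] += x[i];
  };
}

// Sketch of the Cache().At(key) idea: resolve once per distinct key, then
// reuse the cached function object on every subsequent call.
class KernelCache {
 public:
  static KernelCache& Cache() {
    static thread_local KernelCache c;  // assumption: one cache per thread
    return c;
  }
  const KernelFn& At(int key) {
    auto it = funcs_.find(key);
    if (it == funcs_.end()) it = funcs_.emplace(key, ResolveKernel(key)).first;
    return it->second;
  }

 private:
  std::unordered_map<int, KernelFn> funcs_;
};
```

The win is on hot paths such as FCCompute and the softmax functor, where the same (kernel type, size) pair is requested for every batch.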
*/ + +#pragma once + +#include +#include +#include "ngraph/ngraph.hpp" +#include "paddle/fluid/platform/ngraph_helper.h" + +namespace paddle { +namespace operators { +namespace ngraphs { + +void BuildAccuracyNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto indices = platform::GetInputNode(op, "Indices", ngb_node_map); + auto label = platform::GetInputNode(op, "Label", ngb_node_map); + auto inference = platform::GetInputNode(op, "Out", ngb_node_map); + auto inference_shape = inference->get_shape(); + size_t num_samples = inference_shape.at(0); + size_t k = inference_shape.at(1); + + std::shared_ptr label_k = label; + if (k > 1) { + auto label_1d = std::make_shared( + label, ngraph::AxisVector{0, 1}, ngraph::Shape{num_samples}); + label_k = std::make_shared(label_1d, inference_shape, + ngraph::AxisSet{1}); + } + + auto node_equal = std::make_shared(indices, label_k); + auto node_eq_int = + std::make_shared(node_equal, ngraph::element::i64); + auto num_correct_0d = + std::make_shared(node_eq_int, ngraph::AxisSet{0, 1}); + std::shared_ptr num_correct = + platform::NgReshaper(num_correct_0d, ngraph::Shape{1}); + std::shared_ptr n_samples = ngraph::op::Constant::create( + ngraph::element::i64, ngraph::Shape{1}, {num_samples}); + std::shared_ptr accuracy = std::make_shared( + std::make_shared(num_correct, ngraph::element::f32), + std::make_shared(n_samples, ngraph::element::f32)); + + platform::SetOutputNode(op, "Accuracy", accuracy, ngb_node_map); + platform::SetOutputNode(op, "Correct", num_correct, ngb_node_map); + platform::SetOutputNode(op, "Total", n_samples, ngb_node_map); +} +} // namespace ngraphs +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/ngraph/ops/binary_unnary_op.h b/paddle/fluid/operators/ngraph/ops/binary_unary_op.h similarity index 100% rename from paddle/fluid/operators/ngraph/ops/binary_unnary_op.h rename to paddle/fluid/operators/ngraph/ops/binary_unary_op.h diff --git a/paddle/fluid/operators/ngraph/ops/conv2d_op.h b/paddle/fluid/operators/ngraph/ops/conv2d_op.h new file mode 100644 index 0000000000000000000000000000000000000000..46fb2703f51482afa0546f08b8fc7b2c98e281bc --- /dev/null +++ b/paddle/fluid/operators/ngraph/ops/conv2d_op.h @@ -0,0 +1,235 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
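BuildAccuracyNode above reshapes the label column, broadcasts it across the top-k axis, compares it with the predicted indices, sums the matches, and divides by the batch size. On plain arrays the computation it encodes is roughly the following (an illustrative helper, not the operator; since top-k indices within a row are distinct, counting the first hit per sample is equivalent to summing all equality hits over both axes):

```
#include <cstdint>
#include <vector>

// indices: [num_samples, k] top-k predictions; labels: [num_samples, 1].
// A sample counts as correct if any of its k predictions equals the label.
float Accuracy(const std::vector<int64_t>& indices,
               const std::vector<int64_t>& labels, size_t num_samples,
               size_t k) {
  int64_t correct = 0;
  for (size_t i = 0; i < num_samples; ++i) {
    for (size_t j = 0; j < k; ++j) {
      if (indices[i * k + j] == labels[i]) {
        ++correct;
        break;
      }
    }
  }
  return static_cast<float>(correct) / static_cast<float>(num_samples);
}
```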
*/ + +#pragma once + +#include +#include +#include "ngraph/ngraph.hpp" +#include "paddle/fluid/platform/ngraph_helper.h" + +namespace paddle { +namespace operators { +namespace ngraphs { + +std::shared_ptr GroupedConvolution( + const std::shared_ptr& data_batch, + const std::shared_ptr& filters, const ngraph::Strides strides, + const ngraph::Strides dilations, const ngraph::CoordinateDiff& paddings, + size_t groups) { + auto& data_shape = data_batch->get_shape(); + auto& filter_shape = filters->get_shape(); + ngraph::NodeVector ng_slices; + + for (size_t i = 0; i < groups; ++i) { + size_t channel_step = filter_shape.at(1); + const std::vector lower_bound{0, i * channel_step, 0, 0}; + const std::vector upper_bound{data_shape.at(0), + (i + 1) * channel_step, + data_shape.at(2), data_shape.at(3)}; + auto data_slice = std::make_shared( + data_batch, lower_bound, upper_bound); + + size_t filter_step = filter_shape.at(0) / groups; + const std::vector filter_lower_bound{i * filter_step, 0, 0, 0}; + const std::vector filter_upper_bound{ + (i + 1) * filter_step, filter_shape.at(1), filter_shape.at(2), + filter_shape.at(3)}; + auto filter_slice = std::make_shared( + filters, filter_lower_bound, filter_upper_bound); + auto ng_conv = std::make_shared( + data_slice, filter_slice, strides, dilations, paddings, paddings); + ng_slices.push_back(ng_conv); + } + + size_t concat_axis = 1; + return std::make_shared(ng_slices, concat_axis); +} + +std::shared_ptr GroupedGradConvolutionFilter( + const std::shared_ptr& data_batch, + const std::shared_ptr& filters, + const std::shared_ptr& doutput, const ngraph::Strides strides, + const ngraph::Strides dilations, const ngraph::CoordinateDiff& paddings, + size_t groups) { + auto& data_shape = data_batch->get_shape(); + auto& filter_shape = filters->get_shape(); + auto& out_shape = doutput->get_shape(); + ngraph::NodeVector ng_slices; + + for (size_t i = 0; i < groups; ++i) { + size_t channel_step = filter_shape.at(1); + const std::vector lower_bound{0, i * channel_step, 0, 0}; + const std::vector upper_bound{data_shape.at(0), + (i + 1) * channel_step, + data_shape.at(2), data_shape.at(3)}; + auto data_slice = std::make_shared( + data_batch, lower_bound, upper_bound); + + size_t filter_step = data_shape.at(0); + + const std::vector filter_lower_bound{i * filter_step, 0, 0, 0}; + const std::vector filter_upper_bound{ + (i + 1) * filter_step, filter_shape.at(1), filter_shape.at(2), + filter_shape.at(3)}; + auto filter_slice = std::make_shared( + filters, filter_lower_bound, filter_upper_bound); + + const std::vector olower_bound{0, i * filter_step, 0, 0}; + const std::vector oupper_bound{out_shape.at(0), + (i + 1) * filter_step, + out_shape.at(2), out_shape.at(3)}; + auto out_slice = std::make_shared(doutput, olower_bound, + oupper_bound); + + auto ng_conv = std::make_shared( + data_slice, filter_slice->get_shape(), out_slice, strides, dilations, + paddings, paddings, ngraph::Strides{1, 1}); + + ng_slices.push_back(ng_conv); + } + + size_t concat_axis = 0; + return std::make_shared(ng_slices, concat_axis); +} + +std::shared_ptr GroupedGradConvolutionData( + const std::shared_ptr& data_batch, + const std::shared_ptr& filters, + const std::shared_ptr& doutput, const ngraph::Strides strides, + const ngraph::Strides dilations, const ngraph::CoordinateDiff& paddings, + size_t groups) { + auto& data_shape = data_batch->get_shape(); + auto& filter_shape = filters->get_shape(); + auto& out_shape = doutput->get_shape(); + ngraph::NodeVector ng_slices; + + for (size_t i = 0; i 
< groups; ++i) { + size_t channel_step = filter_shape.at(1); + const std::vector lower_bound{0, i * channel_step, 0, 0}; + const std::vector upper_bound{data_shape.at(0), + (i + 1) * channel_step, + data_shape.at(2), data_shape.at(3)}; + auto data_slice = std::make_shared( + data_batch, lower_bound, upper_bound); + + size_t filter_step = data_shape.at(0); + + const std::vector filter_lower_bound{i * filter_step, 0, 0, 0}; + const std::vector filter_upper_bound{ + (i + 1) * filter_step, filter_shape.at(1), filter_shape.at(2), + filter_shape.at(3)}; + auto filter_slice = std::make_shared( + filters, filter_lower_bound, filter_upper_bound); + + const std::vector olower_bound{0, i * filter_step, 0, 0}; + const std::vector oupper_bound{out_shape.at(0), + (i + 1) * filter_step, + out_shape.at(2), out_shape.at(3)}; + auto out_slice = std::make_shared(doutput, olower_bound, + oupper_bound); + + auto ng_conv = std::make_shared( + data_slice->get_shape(), filter_slice, out_slice, strides, dilations, + paddings, paddings, ngraph::Strides{1, 1}); + ng_slices.push_back(ng_conv); + } + + size_t concat_axis = 1; + return std::make_shared(ng_slices, concat_axis); +} + +void BuildConv2dNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto op_attrs = paddle::framework::AttrReader(op->Attrs()); + auto filters = paddle::platform::GetInputNode(op, "Filter", ngb_node_map); + auto input = paddle::platform::GetInputNode(op, "Input", ngb_node_map); + + std::vector strides = op_attrs.Get>("strides"); + std::vector paddings = op_attrs.Get>("paddings"); + std::vector dilations = op_attrs.Get>("dilations"); + + const ngraph::Strides ng_strides{static_cast(strides.at(0)), + static_cast(strides.at(1))}; + const ngraph::Strides ng_dilations{static_cast(dilations.at(0)), + static_cast(dilations.at(1))}; + const ngraph::CoordinateDiff ng_paddings{ + static_cast(paddings.at(0)), + static_cast(paddings.at(1))}; + + int groups = static_cast(op_attrs.Get("groups")); + PADDLE_ENFORCE_GE(groups, 1, "conv groups needs be no less than 1"); + + std::shared_ptr result; + if (groups == 1) { + result = std::make_shared( + input, filters, ng_strides, ng_dilations, ng_paddings, ng_paddings); + } else { + result = GroupedConvolution(input, filters, ng_strides, ng_dilations, + ng_paddings, groups); + } + paddle::platform::SetOutputNode(op, "Output", result, ngb_node_map); +} + +void BuildConv2dGradNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto op_attrs = paddle::framework::AttrReader(op->Attrs()); + auto filter = paddle::platform::GetInputNode(op, "Filter", ngb_node_map); + auto input = paddle::platform::GetInputNode(op, "Input", ngb_node_map); + auto doutput = + paddle::platform::GetInputNode(op, "Output@GRAD", ngb_node_map); + + int groups = op_attrs.Get("groups"); + std::vector strides = op_attrs.Get>("strides"); + std::vector paddings = op_attrs.Get>("paddings"); + std::vector dilations = op_attrs.Get>("dilations"); + + const ngraph::Strides ng_strides{static_cast(strides.at(0)), + static_cast(strides.at(1))}; + const ngraph::Strides ng_dilations{static_cast(dilations.at(0)), + static_cast(dilations.at(1))}; + const ngraph::CoordinateDiff ng_paddings{ + static_cast(paddings.at(0)), + static_cast(paddings.at(1))}; + + std::shared_ptr dfilter; + std::shared_ptr dinput; + if (groups == 1) { + dfilter = std::make_shared( + input, filter->get_shape(), doutput, ng_strides, ng_dilations, + ng_paddings, ng_paddings, ngraph::Strides{1, 1}); + 
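GroupedConvolution in the new conv2d_op.h emulates grouped convolution by slicing the input's channels and the filter's output channels per group, convolving each pair, and concatenating the results along axis 1. The slice arithmetic in isolation, assuming the NCHW data / OIHW filter layout the hunk indexes into (the helper and types are illustrative):

```
#include <cstddef>
#include <utility>

struct Slice4D {
  size_t lower[4];
  size_t upper[4];
};

// For group i of `groups`, compute the input-channel slice of the data
// (NCHW) and the output-channel slice of the filters (OIHW), matching the
// bounds built inside GroupedConvolution.
std::pair<Slice4D, Slice4D> GroupSlices(const size_t data_shape[4],
                                        const size_t filter_shape[4],
                                        size_t groups, size_t i) {
  size_t channel_step = filter_shape[1];          // in-channels per group
  size_t filter_step = filter_shape[0] / groups;  // out-channels per group
  Slice4D data{{0, i * channel_step, 0, 0},
               {data_shape[0], (i + 1) * channel_step, data_shape[2],
                data_shape[3]}};
  Slice4D filter{{i * filter_step, 0, 0, 0},
                 {(i + 1) * filter_step, filter_shape[1], filter_shape[2],
                  filter_shape[3]}};
  return {data, filter};
}
```

Each per-group convolution output is then concatenated back on the channel axis, which reproduces a single grouped convolution.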
+ dinput = std::make_shared( + input->get_shape(), filter, doutput, ng_strides, ng_dilations, + ng_paddings, ng_paddings, ngraph::Strides{1, 1}); + + } else { + dfilter = GroupedGradConvolutionFilter(input, filter, doutput, ng_strides, + ng_dilations, ng_paddings, groups); + dinput = GroupedGradConvolutionData(input, filter, doutput, ng_strides, + ng_dilations, ng_paddings, groups); + } + + paddle::platform::SetOutputNode(op, "Filter@GRAD", dfilter, ngb_node_map); + paddle::platform::SetOutputNode(op, "Input@GRAD", dinput, ngb_node_map); +} +} // namespace ngraphs +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/ngraph/ops/top_k_op.h b/paddle/fluid/operators/ngraph/ops/top_k_op.h index ea66953a125860ab1ce8309819b6c433ff32eaaa..852ecd7139a3c7046e78265ca021b2ce286c63c0 100644 --- a/paddle/fluid/operators/ngraph/ops/top_k_op.h +++ b/paddle/fluid/operators/ngraph/ops/top_k_op.h @@ -36,11 +36,6 @@ void BuildTopKNode( std::make_shared(top_k, 0); std::shared_ptr out = std::make_shared(top_k, 1); - auto dummy_out = paddle::platform::GetOutputNode(op, "Out", ngb_node_map); - if (dummy_out && dummy_out->get_element_type() != out->get_element_type()) { - out = std::make_shared(out, - dummy_out->get_element_type()); - } paddle::platform::SetOutputNode(op, "Indices", indices, ngb_node_map); paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map); } diff --git a/paddle/fluid/operators/norm_op.h b/paddle/fluid/operators/norm_op.h index 6c95d3f3bf3a3b0448a8f39915f8b025f7d3bd46..f81cbc2c733af2a42f27e2ecb05ee2f8e2f8c17b 100644 --- a/paddle/fluid/operators/norm_op.h +++ b/paddle/fluid/operators/norm_op.h @@ -99,10 +99,10 @@ class NormGradKernel : public framework::OpKernel { auto dx_e = framework::EigenVector::Flatten(*out_dx); Eigen::DSizes shape(pre, n, post); - Eigen::DSizes norm_shape(pre, post); + Eigen::DSizes rshape(pre, 1, post); auto x = x_e.reshape(shape); auto dy = dy_e.reshape(shape); - auto norm = norm_e.reshape(norm_shape); + auto norm = norm_e.reshape(rshape); auto dx = dx_e.reshape(shape); framework::Tensor rsum; @@ -111,7 +111,6 @@ class NormGradKernel : public framework::OpKernel { Eigen::DSizes rdim(1); Eigen::DSizes bcast(1, n, 1); - Eigen::DSizes rshape(pre, 1, post); // dx = ( dy/sqrt(sum(x*x)) ) * [1 - x*sum(x) / (sum(x*x) + e)] // = [dy - dy * x * sum(x) / (sum(x*x) + e)] / sqrt(sum(x*x)) diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index 5399ae556e7f38a551d680704d8d825e2fdba88a..fc3636e0b24765f681d3260b07fe854309774a40 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -259,7 +259,7 @@ Example: W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1 $$ - For exclusive = true: + For exclusive = false: $$ hstart = i * strides[0] - paddings[0] hend = hstart + ksize[0] @@ -267,7 +267,7 @@ Example: wend = wstart + ksize[1] Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{ksize[0] * ksize[1]} $$ - For exclusive = false: + For exclusive = true: $$ hstart = max(0, i * strides[0] - paddings[0]) hend = min(H, hstart + ksize[0]) @@ -403,7 +403,7 @@ Example: H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1 \\ W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1 $$ - For exclusive = true: + For exclusive = false: $$ dstart = i * strides[0] - paddings[0] dend = dstart + ksize[0] @@ -413,7 +413,7 @@ Example: wend = wstart + ksize[2] Output(i ,j, k) = 
\\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{ksize[0] * ksize[1] * ksize[2]} $$ - For exclusive = false: + For exclusive = true: $$ dstart = max(0, i * strides[0] - paddings[0]) dend = min(D, dstart + ksize[0]) diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 26ff221dfa0768bd2bcc9e6485a32485f0212ac6..defc29b91f81cb851fec24c5cd9d62dc72c54147 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/operators/reader/buffered_reader.h" #include +#include "paddle/fluid/framework/data_type.h" namespace paddle { namespace operators { @@ -24,6 +25,13 @@ BufferedReader::~BufferedReader() { position_.front().wait(); position_.pop(); } +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(place_)) { + platform::SetDeviceId(boost::get(place_).device); + PADDLE_ENFORCE(cudaStreamDestroy(stream)); + for (auto &event : events) PADDLE_ENFORCE(cudaEventDestroy(event)); + } +#endif } BufferedReader::BufferedReader( @@ -33,6 +41,19 @@ BufferedReader::BufferedReader( thread_pool_(1), place_(place), buffer_size_(buffer_size) { +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(place_)) { + platform::SetDeviceId(boost::get(place_).device); + compute_stream = + ((platform::CUDADeviceContext *)(platform::DeviceContextPool::Instance() + .Get(place_))) + ->stream(); + events.resize(buffer_size); + for (auto &event : events) + PADDLE_ENFORCE(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); + PADDLE_ENFORCE(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + } +#endif cpu_buffer_.resize(buffer_size); gpu_buffer_.resize(buffer_size); ReadTillBufferFullAsync(); @@ -46,6 +67,12 @@ void BufferedReader::ReadTillBufferFullAsync() { } void BufferedReader::ReadAsync(size_t i) { +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(place_)) { + platform::SetDeviceId(boost::get(place_).device); + PADDLE_ENFORCE(cudaEventRecord(events[i], compute_stream)); + } +#endif position_.emplace(thread_pool_.enqueue([this, i]() -> size_t { TensorVec &cpu = cpu_buffer_[i]; reader_->ReadNext(&cpu); @@ -54,14 +81,41 @@ void BufferedReader::ReadAsync(size_t i) { return -1UL; } +#ifdef PADDLE_WITH_CUDA + // NOTE(liangdun): using async copy instead of TensorCopySync + // TensorCopySync would block other stream if (platform::is_gpu_place(place_)) { + platform::SetDeviceId(boost::get(place_).device); + PADDLE_ENFORCE(cudaStreamWaitEvent(stream, events[i], 0)); TensorVec &gpu = gpu_buffer_[i]; gpu.resize(cpu.size()); for (size_t i = 0; i < cpu.size(); ++i) { - framework::TensorCopySync(cpu[i], place_, &gpu[i]); + gpu[i].Resize(cpu[i].dims()); + gpu[i].set_layout(cpu[i].layout()); + auto cpu_place = cpu[i].place(); + auto cpu_ptr = cpu[i].data(); + auto gpu_ptr = gpu[i].mutable_data(place_, cpu[i].type()); + auto size = + cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type()); + if (platform::is_cuda_pinned_place(cpu_place)) + memory::Copy(boost::get(place_), gpu_ptr, + boost::get(cpu_place), + cpu_ptr, size, stream); + else if ((platform::is_gpu_place(cpu_place))) + memory::Copy(boost::get(place_), gpu_ptr, + boost::get(cpu_place), cpu_ptr, + size, stream); + else + // if cpu place is not pinned, async copy is slower than sync copy, + // so we use sync copy instead. 
+ memory::Copy(boost::get(place_), gpu_ptr, + boost::get(cpu_place), cpu_ptr, size, + 0); gpu[i].set_lod(cpu[i].lod()); } + PADDLE_ENFORCE(cudaStreamSynchronize(stream)); } +#endif return i; })); } diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h index cbe2bc1b5fdd69d1a843b768e3289acd621369a6..87680da01a1f51cfdfe4d100508440eda9d1877f 100644 --- a/paddle/fluid/operators/reader/buffered_reader.h +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -19,6 +19,9 @@ #include #include "ThreadPool.h" #include "paddle/fluid/framework/reader.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/gpu_info.h" +#endif namespace paddle { namespace operators { @@ -59,6 +62,11 @@ class BufferedReader : public framework::DecoratedReader { std::vector cpu_buffer_; std::vector gpu_buffer_; size_t prev_pos_{-1UL}; +#ifdef PADDLE_WITH_CUDA + cudaStream_t stream; + cudaStream_t compute_stream; + std::vector events; +#endif }; } // namespace reader diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc index f08798794a2f9fc042800583cbc032d6f12bf3dc..43a49de52242b96aade91013e89228fcb3247302 100644 --- a/paddle/fluid/operators/reader/ctr_reader.cc +++ b/paddle/fluid/operators/reader/ctr_reader.cc @@ -213,7 +213,7 @@ void ReadSvmData(const DataDesc& data_desc, std::shared_ptr reader, framework::LoD lod{lod_data}; lod_tensor.set_lod(lod); int64_t* tensor_data = lod_tensor.mutable_data( - framework::make_ddim({1, static_cast(batch_feasign.size())}), + framework::make_ddim({static_cast(batch_feasign.size()), 1}), platform::CPUPlace()); memcpy(tensor_data, batch_feasign.data(), batch_feasign.size() * sizeof(int64_t)); @@ -223,7 +223,7 @@ void ReadSvmData(const DataDesc& data_desc, std::shared_ptr reader, // insert label tensor framework::LoDTensor label_tensor; auto* label_tensor_data = label_tensor.mutable_data( - framework::make_ddim({1, static_cast(batch_label.size())}), + framework::make_ddim({static_cast(batch_label.size()), 1}), platform::CPUPlace()); memcpy(label_tensor_data, batch_label.data(), batch_label.size() * sizeof(int64_t)); diff --git a/paddle/fluid/operators/reader/ctr_reader_test.cc b/paddle/fluid/operators/reader/ctr_reader_test.cc index 9f3a254c84d4e04fbcd449644a7e138eff520fbc..6410439816d8ae4a9d1df507819071ce76b5308e 100644 --- a/paddle/fluid/operators/reader/ctr_reader_test.cc +++ b/paddle/fluid/operators/reader/ctr_reader_test.cc @@ -123,7 +123,7 @@ TEST(CTR_READER, read_data) { std::vector>> data_slot_6003{b1, b2, b3, b4}; - std::vector label_dims = {{1, 3}, {1, 3}, {1, 3}, {1, 1}}; + std::vector label_dims = {{3, 1}, {3, 1}, {3, 1}, {1, 1}}; LoDTensorBlockingQueueHolder queue_holder; int capacity = 64; diff --git a/paddle/fluid/operators/reduce_ops/CMakeLists.txt b/paddle/fluid/operators/reduce_ops/CMakeLists.txt index 5fe4d15ae2c6254a50318813c852b6c314880aba..ebd07d90ebe6b0ba008ac89c01c4f054f96a6da9 100644 --- a/paddle/fluid/operators/reduce_ops/CMakeLists.txt +++ b/paddle/fluid/operators/reduce_ops/CMakeLists.txt @@ -1,5 +1,9 @@ include(operators) -register_operators() +if(WITH_GPU) + register_operators(DEPS cub) +else() + register_operators() +endif() if(WITH_GPU) file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.part.cu") diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 32365d6a9602fa8ad2c01b59c8cd361d52ed973f..eda54f76b898cdf893347d31cadb86dea892a4ce 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ 
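Two notes on the hunks above. First, the pool_op.cc fix swaps two formula blocks that had been attached to the wrong flag value: exclusive = false always divides by the full ksize product, while exclusive = true clips the window to the input and divides by the actual window size. A 1-D sketch of the distinction (illustrative, not the kernel):

```
#include <algorithm>

// Average pooling at output index i over a 1-D input of length H.
float AvgPool1D(const float* in, int H, int i, int stride, int pad, int ksize,
                bool exclusive) {
  int start = i * stride - pad;
  int cstart = std::max(start, 0);          // clipped window start
  int cend = std::min(start + ksize, H);    // clipped window end
  float sum = 0.f;
  for (int h = cstart; h < cend; ++h) sum += in[h];
  // exclusive: divide by the clipped window; otherwise by the full ksize.
  int divisor = exclusive ? (cend - cstart) : ksize;
  return sum / divisor;
}
```

Second, the buffered_reader change replaces TensorCopySync with copies on a dedicated non-blocking stream, ordered against the compute stream by an event. The ordering skeleton, stripped of the tensor bookkeeping (error handling elided; in the reader itself the compute stream comes from the CUDADeviceContext):

```
#include <cuda_runtime.h>

cudaStream_t compute_stream, copy_stream;
cudaEvent_t slot_event;

void Setup() {
  // Created here only to keep the sketch self-contained; BufferedReader
  // takes the compute stream from the device context instead.
  cudaStreamCreate(&compute_stream);
  cudaStreamCreateWithFlags(&copy_stream, cudaStreamNonBlocking);
  cudaEventCreateWithFlags(&slot_event, cudaEventDisableTiming);
}

void AsyncFill(void* gpu_dst, const void* pinned_src, size_t bytes) {
  // 1. Mark the point in the compute stream the copy must not overtake.
  cudaEventRecord(slot_event, compute_stream);
  // 2. The copy stream waits on that event instead of the whole device.
  cudaStreamWaitEvent(copy_stream, slot_event, 0);
  // 3. Asynchronous H2D copy; only safe because the source is pinned.
  cudaMemcpyAsync(gpu_dst, pinned_src, bytes, cudaMemcpyHostToDevice,
                  copy_stream);
  // 4. Block just this thread until this slot's copies have landed.
  cudaStreamSynchronize(copy_stream);
}
```

This is also why the hunk keeps a synchronous fallback for unpinned host memory: cudaMemcpyAsync from pageable memory degrades, so the sync copy is faster there, as the added comment says.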
b/paddle/fluid/operators/reshape_op.cc @@ -327,14 +327,45 @@ class Reshape2GradOp : public framework::OperatorWithKernel { } }; +class ReshapeOpInplaceInToOut : public framework::InplaceInToOut { + public: + using InplaceInToOut::InplaceInToOut; + + protected: + std::unordered_map Apply( + const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + std::unordered_map inplace_in_to_out = { + {"X", "Out"}, + }; + return inplace_in_to_out; + } +}; + +class ReshapeGradInplaceInToOut : public framework::InplaceInToOut { + using InplaceInToOut::InplaceInToOut; + + protected: + std::unordered_map Apply( + const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + std::unordered_map inplace_in_to_out = { + {framework::GradVarName("Out"), framework::GradVarName("X")}, + }; + return inplace_in_to_out; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OPERATOR(reshape, ops::ReshapeOp, ops::ReshapeOpMaker, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(reshape_grad, ops::ReshapeGradOp); + paddle::framework::DefaultGradOpDescMaker, + ops::ReshapeOpInplaceInToOut); +REGISTER_OPERATOR(reshape_grad, ops::ReshapeGradOp, + ops::ReshapeGradInplaceInToOut); REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double, ops::ReshapeKernel, int, ops::ReshapeKernel, int64_t, ops::ReshapeKernel); @@ -344,8 +375,9 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel, ops::ReshapeGradKernel); REGISTER_OPERATOR(reshape2, ops::Reshape2Op, ops::Reshape2OpMaker, - ops::Reshape2GradMaker); -REGISTER_OPERATOR(reshape2_grad, ops::Reshape2GradOp); + ops::Reshape2GradMaker, ops::ReshapeOpInplaceInToOut); +REGISTER_OPERATOR(reshape2_grad, ops::Reshape2GradOp, + ops::ReshapeGradInplaceInToOut); REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double, ops::ReshapeKernel, int, ops::ReshapeKernel, int64_t, ops::ReshapeKernel); diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index 981969d2aaa684731a615ec64ca7f7718b35cf09..4ea77ed30db212b694f2050952655dd1a42215bd 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -100,13 +100,14 @@ class ScaleGradMaker : public framework::SingleGradOpDescMaker { } }; +using ScaleOpInplace = framework::SingleOpInplaceInToOut; } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradMaker, - ops::ScaleOpVarTypeInference); + ops::ScaleOpVarTypeInference, ops::ScaleOpInplace); REGISTER_OP_CPU_KERNEL( scale, ops::ScaleKernel, ops::ScaleKernel, diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index bc889a5a042a27838ba6ba0fccb187ec11b5f0c5..8fbf299a7c056aff3bfd4cbd3e3cc28fd3c6ccf2 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -198,6 +198,21 @@ class SoftmaxOpGradMaker : public framework::SingleGradOpDescMaker { return std::unique_ptr(op); } }; + +class SoftmaxInplaceInToOut : public framework::InplaceInToOut { + public: + using framework::InplaceInToOut::InplaceInToOut; + + protected: + std::unordered_map Apply( + const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + return std::unordered_map{ + {"X", "Out"}, + }; + } +}; + } // namespace operators } // namespace paddle diff --git 
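The reshape, scale, and softmax hunks above all attach inplace passes whose Apply returns a plain input-to-output variable map; for the gradient ops the pair is expressed via GradVarName, which appends the @GRAD suffix visible elsewhere in this diff ("Output@GRAD", "Filter@GRAD"). A toy rendering of just the mappings (GradVarName here is a local stand-in):

```
#include <string>
#include <unordered_map>

// Local stand-in for framework::GradVarName.
std::string GradVarName(const std::string& name) { return name + "@GRAD"; }

// Forward: Out may reuse X's buffer. Backward: X@GRAD may reuse Out@GRAD's.
std::unordered_map<std::string, std::string> ReshapeInplacePairs() {
  return {{"X", "Out"}};
}
std::unordered_map<std::string, std::string> ReshapeGradInplacePairs() {
  return {{GradVarName("Out"), GradVarName("X")}};
}
```

The executor-side pass decides whether the aliasing is actually legal; the op only declares which pairs are candidates.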
a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 1f51b5bab3068cc89bffa85de28a9438359659f3..fbb2ac3fe8c5de9b0be593df225677c6a7a89e9c 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -1,4 +1,4 @@ -proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto) +proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto simple_threadpool) py_proto_compile(profiler_py_proto SRCS profiler.proto) add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) @@ -36,7 +36,7 @@ cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info) nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce) -cc_library(place SRCS place.cc DEPS enforce boost) +cc_library(place SRCS place.cc DEPS enforce boost lib_any) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) add_subdirectory(dynload) diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/cuda_device_function.h index 2ce8f141d3c51661305f4952479cf2889fc4f396..31b6c38d613cf9df8fa7e8f6a8e1cfa310280968 100644 --- a/paddle/fluid/platform/cuda_device_function.h +++ b/paddle/fluid/platform/cuda_device_function.h @@ -53,10 +53,12 @@ inline static int RoundToPowerOfTwo(int dim) { __VA_ARGS__; \ } break -#define CUDA_LAUNCH_KERNEL_HELPER(...) \ - CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__); \ +#define CUDA_LAUNCH_KERNEL_HELPER(...) \ + CUDA_LAUNCH_KERNEL_BASE(1024, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(512, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__); \ CUDA_LAUNCH_KERNEL_BASE(32, ##__VA_ARGS__); template diff --git a/paddle/fluid/platform/ngraph_helper.h b/paddle/fluid/platform/ngraph_helper.h index c5b65d6636945b85603c07aeae0290ef9cadb396..b84315995a9d8a65668f57eef67f6dab8c20f9b3 100644 --- a/paddle/fluid/platform/ngraph_helper.h +++ b/paddle/fluid/platform/ngraph_helper.h @@ -43,13 +43,14 @@ std::shared_ptr NgReshaper(std::shared_ptr input, std::shared_ptr GetNode( const std::shared_ptr& op, - const std::string prm, const paddle::framework::VariableNameMap& var_map, + const std::string name, const paddle::framework::VariableNameMap& var_map, std::shared_ptr< std::unordered_map>> ngb_node_map) { - auto& var_names = var_map.at(prm); + auto& var_names = var_map.at(name); PADDLE_ENFORCE_EQ(var_names.size(), 1, - "op %s prm %s expects one associated var", op->Type(), prm); + "op %s name %s expects one associated var", op->Type(), + name); if (ngb_node_map->find(var_names[0]) != ngb_node_map->end()) { return (*ngb_node_map)[var_names[0]]; } else { @@ -59,43 +60,53 @@ std::shared_ptr GetNode( std::shared_ptr GetInputNode( const std::shared_ptr& op, - const std::string prm, + const std::string name, std::shared_ptr< std::unordered_map>> ngb_node_map) { - return GetNode(op, prm, op->Inputs(), ngb_node_map); + return GetNode(op, name, op->Inputs(), ngb_node_map); } std::shared_ptr GetOutputNode( const std::shared_ptr& op, - const std::string prm, + const std::string name, std::shared_ptr< std::unordered_map>> ngb_node_map) { - return GetNode(op, prm, op->Outputs(), ngb_node_map); + return GetNode(op, name, op->Outputs(), ngb_node_map); } void SetOutputNode( const std::shared_ptr& op, - const std::string prm, std::shared_ptr node, + const std::string name, std::shared_ptr node, 
std::shared_ptr< std::unordered_map>> ngb_node_map) { - auto& var_names = op->Outputs().at(prm); + auto& var_names = op->Outputs().at(name); if (var_names.size() == 1) { + /* */ + auto dummy_out = GetOutputNode(op, name, ngb_node_map); + if (dummy_out && dummy_out->get_shape() != node->get_shape()) { + node = NgReshaper(node, dummy_out->get_shape()); + } + if (dummy_out && + dummy_out->get_element_type() != node->get_element_type()) { + node = std::make_shared( + node, dummy_out->get_element_type()); + } (*ngb_node_map)[var_names[0]] = node; } else if (var_names.size() == 0) { (*ngb_node_map)[""] = node; } else { - PADDLE_THROW("prm %s has more than 1 var_names.", prm); + PADDLE_THROW("name %s has more than 1 var_names.", name); } } bool HasOutput(const std::shared_ptr& op, - const std::string prm) { + const std::string name) { auto& outputs = op->Outputs(); - if (outputs.find(prm) == outputs.end()) return false; - return outputs.at(prm).size() > 0; + if (outputs.find(name) == outputs.end()) return false; + return outputs.at(name).size() > 0; } inline void GetMidDims(const ngraph::Shape& x_shape, diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 803ea6b26087884ad79c6bf80238953a012eaddc..4ac5b83c56b114f4e3e4c78710716adc636ebe1d 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -26,5 +26,5 @@ if(WITH_PYTHON) get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) target_link_libraries(paddle_pybind ${os_dependency_modules}) - cc_test(tensor_py_test SRCS tensor_py_test.cc DEPS python) + cc_test(tensor_py_test SRCS tensor_py_test.cc DEPS python pybind) endif(WITH_PYTHON) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 97e5bbaaccaf7c702a324abd708a314c72ece004..6549229e05de5f2a809b56775d9788bbf8e5c1ae 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -37,6 +37,7 @@ limitations under the License. */ #include "paddle/fluid/framework/version.h" #include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h" +#include "paddle/fluid/memory/allocation/legacy_allocator.h" #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/py_func_op.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" @@ -127,6 +128,13 @@ PYBIND11_MODULE(core, m) { m.add_object("_cleanup", py::capsule([]() { ScopePool::Instance().Clear(); })); + m.def("get_mem_usage", [](int device) { + return memory::allocation::GPUMemMonitor.GetMemUsage(device); + }); + + m.def("print_mem_usage", + []() { return memory::allocation::GPUMemMonitor.PrintMemUsage(); }); + py::class_(m, "VarBase", R"DOC()DOC") // .def(py::init<>()) .def(py::init(), py::arg("stop_gradient") = false) @@ -1088,6 +1096,10 @@ All parameter, weight, gradient are variables in Paddle. 
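cuda_device_function.h above extends the launch-helper dispatch with 1024- and 512-thread cases, so sizes that round up past 256 no longer fall back to small blocks; 1024 is CUDA's per-block thread limit, hence the clamp below. A generic sketch of the rounding and dispatch shape (the header has its own RoundToPowerOfTwo; this is only an illustration):

```
#include <algorithm>

// Generic next-power-of-two rounding via the classic bit-smearing trick.
inline int NextPowerOfTwo(int dim) {
  int n = dim - 1;
  n |= n >> 1;
  n |= n >> 2;
  n |= n >> 4;
  n |= n >> 8;
  n |= n >> 16;
  return n + 1;
}

// Shape of the macro dispatch after the change: 1024 and 512 are new cases.
template <typename F>
void LaunchByBlockDim(int dim, F f) {
  switch (std::min(NextPowerOfTwo(dim), 1024)) {
    case 1024: f(1024); break;
    case 512: f(512); break;
    case 256: f(256); break;
    case 128: f(128); break;
    case 64: f(64); break;
    default: f(32); break;
  }
}
```

And SetOutputNode in ngraph_helper.h now reconciles a freshly built node with any pre-registered output — reshaping on shape mismatch, converting on element-type mismatch — which is what made the top_k_op.h special case earlier in this diff removable. The guard in plain form, with a fake node type standing in for ngraph::Node and its reshape/convert ops:

```
#include <memory>
#include <vector>

// Minimal stand-in for an ngraph node: a shape plus an element-type tag.
struct FakeNode {
  std::vector<size_t> shape;
  int etype;
};

// Mirrors the new SetOutputNode guard: coerce the produced node to the
// placeholder's shape and element type before registering it.
std::shared_ptr<FakeNode> Reconcile(std::shared_ptr<FakeNode> node,
                                    const FakeNode* expected) {
  if (expected == nullptr) return node;
  if (expected->shape != node->shape) {  // stands in for NgReshaper
    node = std::make_shared<FakeNode>(FakeNode{expected->shape, node->etype});
  }
  if (expected->etype != node->etype) {  // stands in for ngraph::op::Convert
    node = std::make_shared<FakeNode>(FakeNode{node->shape, expected->etype});
  }
  return node;
}
```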
"memory_early_delete", [](const BuildStrategy &self) { return self.memory_early_delete_; }, [](BuildStrategy &self, bool b) { self.memory_early_delete_ = b; }) + .def_property( + "enable_inplace", + [](const BuildStrategy &self) { return self.enable_inplace_; }, + [](BuildStrategy &self, bool b) { self.enable_inplace_ = b; }) .def("_finalize_strategy_and_create_passes", [](BuildStrategy &self) -> std::shared_ptr { return self.CreatePassesFromStrategy(true); diff --git a/paddle/scripts/fast_install.sh b/paddle/scripts/fast_install.sh new file mode 100644 index 0000000000000000000000000000000000000000..b960d0f00a26196c827053c41a3b35b97e7cdb07 --- /dev/null +++ b/paddle/scripts/fast_install.sh @@ -0,0 +1,923 @@ +#!/bin/bash + +path='http://paddlepaddle.org/download?url=' +#release_version=`curl -s https://pypi.org/project/paddlepaddle/|grep -E "/project/paddlepaddle/"|grep "release"|awk -F '/' '{print $(NF-1)}'|head -1` +release_version=1.2.0 +python_list=( +"27" +"35" +"36" +"37" +) + + +function use_cpu(){ + while true + do + read -p "是否安装CPU版本的PaddlePaddle?(y/n)" cpu_option + cpu_option=`echo $cpu_option | tr 'A-Z' 'a-z'` + if [[ "$cpu_option" == "" || "$cpu_option" == "n" ]];then + echo "退出安装中..." + exit + else + GPU='cpu' + echo "将为您安装CPU版本的PaddlePaddle" + break + fi + done +} + +function checkLinuxCUDNN(){ + echo + read -n1 -p "请按回车键进行下一步..." + echo + while true + do + version_file='/usr/local/cuda/include/cudnn.h' + if [ -f "$version_file" ];then + CUDNN=`cat $version_file | grep CUDNN_MAJOR |awk 'NR==1{print $NF}'` + fi + if [ "$CUDNN" == "" ];then + version_file=`sudo find /usr -name "cudnn.h"|head -1` + if [ "$version_file" != "" ];then + CUDNN=`cat ${version_file} | grep CUDNN_MAJOR -A 2|awk 'NR==1{print $NF}'` + else + echo "检测结果:未在常规路径下找到cuda/include/cudnn.h文件" + while true + do + read -p "请核实cudnn.h位置,并在此输入路径(请注意,路径需要输入到“cudnn.h”这一级):" cudnn_version + echo + if [ "$cudnn_version" == "" ] || [ ! 
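The pybind.cc hunk above exposes the GPU memory monitor and the new enable_inplace flag on BuildStrategy through getter/setter lambdas. The def_property pattern in a minimal pybind11 module, where the struct is a stand-in for Paddle's BuildStrategy:

```
#include <pybind11/pybind11.h>
namespace py = pybind11;

struct BuildStrategy {
  bool enable_inplace_{false};
};

PYBIND11_MODULE(core, m) {
  py::class_<BuildStrategy>(m, "BuildStrategy")
      .def(py::init<>())
      // Same getter/setter pair style as the enable_inplace property above.
      .def_property(
          "enable_inplace",
          [](const BuildStrategy &self) { return self.enable_inplace_; },
          [](BuildStrategy &self, bool b) { self.enable_inplace_ = b; });
}
```

From Python this reads and writes like a plain attribute (strategy.enable_inplace = True), which is how compiler.py toggles it later in this diff.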
-f "$cudnn_version" ];then + read -p "仍未找到cuDNN,输入y将安装CPU版本的PaddlePaddle,输入n可重新录入cuDNN路径,请输入(y/n)" cpu_option + echo + cpu_option=`echo $cpu_option | tr 'A-Z' 'a-z'` + if [ "$cpu_option" == "y" -o "$cpu_option" == "" ];then + GPU='cpu' + break + else + echo "请重新输入" + echo + fi + else + CUDNN=`cat $cudnn_version | grep CUDNN_MAJOR |awk 'NR==1{print $NF}'` + echo "检测结果:找到cudnn.h" + break + fi + done + if [ "$GPU" == "cpu" ];then + break + fi + fi + fi + if [ "$CUDA" == "9" -a "$CUDNN" != "7" ];then + echo + echo "目前CUDA9下仅支持cuDNN7,暂不支持您机器上的CUDNN${CUDNN}。您可以访问NVIDIA官网下载适合版本的CUDNN,请ctrl+c退出安装进程。按回车键将为您安装CPU版本的PaddlePaddle" + echo + use_cpu() + if [ "$GPU"=="cpu" ];then + break + fi + fi + + if [ "$CUDNN" == 5 ] || [ "$CUDNN" == 7 ];then + echo + echo "您的CUDNN版本是: CUDNN$CUDNN" + break + else + echo + read -n1 -p "目前支持的CUDNN版本为5和7,暂不支持您机器上的CUDNN${CUDNN},将为您安装CPU版本的PaddlePaddle,请按回车键开始安装" + echo + use_cpu + if [ "$GPU"=="cpu" ];then + break + fi + fi + done +} + +function checkLinuxCUDA(){ + while true + do + CUDA=`echo ${CUDA_VERSION}|awk -F "[ .]" '{print $1}'` + if [ "$CUDA" == "" ];then + if [ -f "/usr/local/cuda/version.txt" ];then + CUDA=`cat /usr/local/cuda/version.txt | grep 'CUDA Version'|awk -F '[ .]' '{print $3}'` + tmp_cuda=$CUDA + fi + if [ -f "/usr/local/cuda8/version.txt" ];then + CUDA=`cat /usr/local/cuda8/version.txt | grep 'CUDA Version'|awk -F '[ .]' '{print $3}'` + tmp_cuda8=$CUDA + fi + if [ -f "/usr/local/cuda9/version.txt" ];then + CUDA=`cat /usr/local/cuda9/version.txt | grep 'CUDA Version'|awk -F '[ .]' '{print $3}'` + tmp_cuda9=$CUDA + fi + fi + + if [ "$tmp_cuda" != "" ];then + echo "检测结果:找到CUDA $tmp_cuda" + fi + if [ "$tmp_cudai8" != "" ];then + echo "检测结果:找到CUDA $tmp_cuda8" + fi + if [ "$tmp_cuda9" != "" ];then + echo "检测结果:找到CUDA $tmp_cuda9" + fi + + if [ "$CUDA" == "" ];then + echo "检测结果:没有在常规路径下找到cuda/version.txt文件" + while true + do + read -p "请输入cuda/version.txt的路径:" cuda_version + if [ "$cuda_version" == "" || ! -f "$cuda_version" ];then + read -p "仍未找到CUDA,输入y将安装CPU版本的PaddlePaddle,输入n可重新录入CUDA路径,请输入(y/n)" cpu_option + cpu_option=`echo $cpu_option | tr 'A-Z' 'a-z'` + if [ "$cpu_option" == "y" || "$cpu_option" == "" ];then + GPU='cpu' + break + else + echo "重新输入..." + fi + else + CUDA=`cat $cuda_version | grep 'CUDA Version'|awk -F '[ .]' '{print $3}'` + if [ "$CUDA" == "" ];then + echo "未能在version.txt中找到CUDA相关信息" + else + break + fi + fi + done + if [ "$GPU" == "cpu" ];then + break + fi + fi + + if [ "$CUDA" == "8" ] || [ "$CUDA" == "9" ];then + echo "您的CUDA版本是${CUDA}" + break + else + echo "目前支持CUDA8/9,暂不支持您的CUDA${CUDA},将为您安装CPU版本的PaddlePaddle" + echo + use_cpu + fi + + if [ "$GPU" == "cpu" ];then + break + fi + done +} + +function checkLinuxMathLibrary(){ + while true + do + if [ "$AVX" == "" ];then + echo "正在检测您环境中是否存在AVX指令集..." + echo + echo "检测结果:您电脑上没有AVX指令集,目前针对无AVX指令集的环境,我们仅提供支持mkl数学库的PaddlePaddle,将为您安装此版本的PaddlePaddle" + math='mkl' + break + elif [ "$GPU" == "gpu" ];then + math='mkl' + echo "检测到您的机器上配备GPU,推荐您使用mkl数学库" + break + else + read -p "请输入您希望使用的数学库: + 1:openblas 一个高性能多核 BLAS 库 + 2:mkl(推荐) 英特尔数学核心函数库 + => 请输入数字1或2。如输入其他字符或直接回车,将会默认选择【 2. mkl 】 。请在这里输入并回车:" math + if [ "$math" == "" ];then + math="mkl" + echo "您选择了数字【2】" + break + fi + if [ "$math" == "1" ];then + math=openblas + echo "您选择了数字【1】" + break + elif [ "$math" == "2" ];then + math=mkl + echo "您选择了数字【2】" + break + fi + echo "输入错误,请再次输入" + fi + done +} + +function checkLinuxPaddleVersion(){ + read -n1 -p "请按回车键继续..." + while true + do + read -p " + 1. 
开发版:对应Github上develop分支,如您需要开发、或希望使用PaddlePaddle最新功能,请选用此版本 + 2. 稳定版(推荐):如您无特殊开发需求,建议使用此版本,目前最新的版本号为 ${release_version} + => 请输入数字1或2。如输入其他字符或直接回车,将会默认选择【 2. 稳定版 】 。请在这里输入并回车:" paddle_version + if [ "$paddle_version" == "" ];then + paddle_version="release-${release_version}" + echo "您选择了数字【2】,为您安装release-${release_version}" + break + fi + if [ "$paddle_version" == "1" ];then + echo "您选择了数字【1】,将为您安装开发版" + break + elif [ "$paddle_version" == "2" ];then + echo "您选择了数字【2】,为您安装release-${release_version}" + break + fi + echo "输入错误,请再次输入" + done +} + +function checkLinuxPip(){ + while true + do + echo "请输入您要使用的pip目录(您可以另起终端,并使用which pip来查看):" + read -p "" pip_path + if [ "$pip_path" == "" -o ! -f "$pip_path" ];then + echo "检测结果:pip不存在,请重新输入" + continue + fi + python_version=`$pip_path --version|awk -F "[ |)]" '{print $6}'|sed 's#\.##g'` + if [ "$python_version" == "27" ];then + uncode=`python -c "import pip._internal;print(pip._internal.pep425tags.get_supported())"|grep "cp27mu"` + if [[ "$uncode" == "" ]];then + uncode= + else + uncode=u + fi + fi + if [ "$python_version" == "" ];then + echo "检测结果:pip不存在,请重新输入" + else + version_list=`echo "${python_list[@]}" | grep "$python_version" ` + if [ "$version_list" != "" ];then + echo "检测结果:找到python${python_version}版本" + break + else + echo "检测结果:找不到可用的 pip, 我们只支持Python27/35/36/37及其对应的pip, 请重新输入, 或使用ctrl + c退出 " + fi + fi + done +} + +function checkLinuxAVX(){ + while true + do + if [[ "$AVX" != "" ]];then + AVX="avx" + break + else + if [ "$CUDA" == "8" -a "$CUDNN" == "7" ] || [ "$GPU" == "cpu" ];then + AVX="noavx" + break + else + echo "Step 6. 检测是否有avx" + echo + echo "检测结果:未能找到avx,我们仅提供CPU版本或配置为CUDA8 cuDNN7的GPU版本的安装包" + break + fi + fi + done +} + +function PipLinuxInstall(){ + wheel_cpu_release="http://paddle-wheel.bj.bcebos.com/${release_version}-${GPU}-${AVX}-${math}/paddlepaddle-${release_version}-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl" + wheel_gpu_release="http://paddle-wheel.bj.bcebos.com/${release_version}-gpu-cuda${CUDA}-cudnn${CUDNN}-${AVX}-${math}/paddlepaddle_gpu-${release_version}.post${CUDA}${CUDNN}-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl" + wheel_gpu_release_noavx="http://paddle-wheel.bj.bcebos.com/${release_version}-gpu-cuda${CUDA}-cudnn${CUDNN}-${AVX}-${math}/paddlepaddle_gpu-${release_version}-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl" + wheel_cpu_develop="http://paddle-wheel.bj.bcebos.com/latest-cpu-${AVX}-${math}/paddlepaddle-latest-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl" + wheel_gpu_develop="http://paddle-wheel.bj.bcebos.com/latest-gpu-cuda${CUDA}-cudnn${CUDNN}-${AVX}-${math}/paddlepaddle_gpu-latest-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl" + + + if [[ "$paddle_version" == "2" ]];then + if [[ "$GPU" == "gpu" ]];then + if [[ ${AVX} == "avx" ]];then + rm -rf `echo $wheel_gpu_release|awk -F '/' '{print $NF}'` + wget -q $wheel_gpu_release + if [ "$?" == "0" ];then + $pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_release + else + echo "paddlepaddle whl包下载失败" + exit 1 + fi + else + rm -rf `echo $wheel_gpu_release_novax|awk -F '/' '{print $NF}'` + wget -q $wheel_gpu_release_novax + if [ "$?" 
== "0" ];then + $pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_release_noavx + else + echo "paddlepaddle whl包下载失败" + exit 1 + fi + fi + else + rm -rf `echo $wheel_cpu_release|awk -F '/' '{print $NF}'` + wget -q $wheel_cpu_release + if [ "$?" == "0" ];then + $pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_cpu_release + else + echo "paddlepaddle whl包下载失败" + exit 1 + fi + fi + else + if [[ "$GPU" == "gpu" ]];then + rm -rf `echo $wheel_gpu_develop|awk -F '/' '{print $NF}'` + wget -q $wheel_gpu_develop + if [ "$?" == "0" ];then + $pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_develop + else + echo "paddlepaddle whl包下载失败" + exit 1 + fi + else + rm -rf `echo $wheel_cpu_develop|awk -F '/' '{print $NF}'` + wget -q $wheel_cpu_develop + if [ "$?" == "0" ];then + $pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_cpu_develop + else + echo "paddlepaddle whl包下载失败" + exit 1 + fi + fi + fi +} + + +function checkLinuxGPU(){ + read -n1 -p "即将检测您的机器是否含GPU,请按回车键继续..." + echo + AVX=`cat /proc/cpuinfo |grep avx|tail -1|grep avx` + which nvidia-smi >/dev/null 2>&1 + if [ "$?" != "0" ];then + GPU='cpu' + echo "未在机器上找到GPU,或PaddlePaddle暂不支持此型号的GPU" + else + GPU='gpu' + echo "已在您的机器上找到GPU,即将确认CUDA和CUDNN版本..." + echo + fi + if [ "$GPU" == 'gpu' ];then + checkLinuxCUDA + checkLinuxCUDNN + fi +} + +function linux(){ +gpu_list=( +"GeForce 410M" +"GeForce 610M" +"GeForce 705M" +"GeForce 710M" +"GeForce 800M" +"GeForce 820M" +"GeForce 830M" +"GeForce 840M" +"GeForce 910M" +"GeForce 920M" +"GeForce 930M" +"GeForce 940M" +"GeForce GT 415M" +"GeForce GT 420M" +"GeForce GT 430" +"GeForce GT 435M" +"GeForce GT 440" +"GeForce GT 445M" +"GeForce GT 520" +"GeForce GT 520M" +"GeForce GT 520MX" +"GeForce GT 525M" +"GeForce GT 540M" +"GeForce GT 550M" +"GeForce GT 555M" +"GeForce GT 610" +"GeForce GT 620" +"GeForce GT 620M" +"GeForce GT 625M" +"GeForce GT 630" +"GeForce GT 630M" +"GeForce GT 635M" +"GeForce GT 640" +"GeForce GT 640 (GDDR5)" +"GeForce GT 640M" +"GeForce GT 640M LE" +"GeForce GT 645M" +"GeForce GT 650M" +"GeForce GT 705" +"GeForce GT 720" +"GeForce GT 720M" +"GeForce GT 730" +"GeForce GT 730M" +"GeForce GT 735M" +"GeForce GT 740" +"GeForce GT 740M" +"GeForce GT 745M" +"GeForce GT 750M" +"GeForce GTS 450" +"GeForce GTX 1050" +"GeForce GTX 1060" +"GeForce GTX 1070" +"GeForce GTX 1080" +"GeForce GTX 1080 Ti" +"GeForce GTX 460" +"GeForce GTX 460M" +"GeForce GTX 465" +"GeForce GTX 470" +"GeForce GTX 470M" +"GeForce GTX 480" +"GeForce GTX 480M" +"GeForce GTX 485M" +"GeForce GTX 550 Ti" +"GeForce GTX 560M" +"GeForce GTX 560 Ti" +"GeForce GTX 570" +"GeForce GTX 570M" +"GeForce GTX 580" +"GeForce GTX 580M" +"GeForce GTX 590" +"GeForce GTX 650" +"GeForce GTX 650 Ti" +"GeForce GTX 650 Ti BOOST" +"GeForce GTX 660" +"GeForce GTX 660M" +"GeForce GTX 660 Ti" +"GeForce GTX 670" +"GeForce GTX 670M" +"GeForce GTX 670MX" +"GeForce GTX 675M" +"GeForce GTX 675MX" +"GeForce GTX 680" +"GeForce GTX 680M" +"GeForce GTX 680MX" +"GeForce GTX 690" +"GeForce GTX 750" +"GeForce GTX 750 Ti" +"GeForce GTX 760" +"GeForce GTX 760M" +"GeForce GTX 765M" +"GeForce GTX 770" +"GeForce GTX 770M" +"GeForce GTX 780" +"GeForce GTX 780M" +"GeForce GTX 780 Ti" +"GeForce GTX 850M" +"GeForce GTX 860M" +"GeForce GTX 870M" +"GeForce GTX 880M" +"GeForce GTX 950" +"GeForce GTX 950M" +"GeForce GTX 960" +"GeForce GTX 960M" 
+"GeForce GTX 965M" +"GeForce GTX 970" +"GeForce GTX 970M" +"GeForce GTX 980" +"GeForce GTX 980M" +"GeForce GTX 980 Ti" +"GeForce GTX TITAN" +"GeForce GTX TITAN Black" +"GeForce GTX TITAN X" +"GeForce GTX TITAN Z" +"Jetson TK1" +"Jetson TX1" +"Jetson TX2" +"Mobile Products" +"NVIDIA NVS 310" +"NVIDIA NVS 315" +"NVIDIA NVS 510" +"NVIDIA NVS 810" +"NVIDIA TITAN V" +"NVIDIA TITAN X" +"NVIDIA TITAN Xp" +"NVS 4200M" +"NVS 5200M" +"NVS 5400M" +"Quadro 410" +"Quadro GP100" +"Quadro K1100M" +"Quadro K1200" +"Quadro K2000" +"Quadro K2000D" +"Quadro K2100M" +"Quadro K2200" +"Quadro K2200M" +"Quadro K3100M" +"Quadro K4000" +"Quadro K4100M" +"Quadro K420" +"Quadro K4200" +"Quadro K4200M" +"Quadro K5000" +"Quadro K500M" +"Quadro K5100M" +"Quadro K510M" +"Quadro K5200" +"Quadro K5200M" +"Quadro K600" +"Quadro K6000" +"Quadro K6000M" +"Quadro K610M" +"Quadro K620" +"Quadro K620M" +"Quadro M1000M" +"Quadro M1200" +"Quadro M2000" +"Quadro M2000M" +"Quadro M2200" +"Quadro M3000M" +"Quadro M4000" +"Quadro M4000M" +"Quadro M5000" +"Quadro M5000M" +"Quadro M500M" +"Quadro M520" +"Quadro M5500M" +"Quadro M6000" +"Quadro M6000 24GB" +"Quadro M600M" +"Quadro M620" +"Quadro Mobile Products" +"Quadro P1000" +"Quadro P2000" +"Quadro P3000" +"Quadro P400" +"Quadro P4000" +"Quadro P5000" +"Quadro P600" +"Quadro P6000" +"Quadro Plex 7000" +"Tegra K1" +"Tegra X1" +"Tesla C2050/C2070" +"Tesla C2075" +"Tesla Data Center Products" +"Tesla K10" +"Tesla K20" +"Tesla K40" +"Tesla K80" +"Tesla M40" +"Tesla M60" +"Tesla P100" +"Tesla P4" +"Tesla P40" +"Tesla V100") + + echo "Step 2. 检测GPU型号和CUDA/cuDNN版本" + echo + checkLinuxGPU + echo + echo "Step 3. 检测数学库" + echo + checkLinuxMathLibrary + echo + echo "Step 4. 选择要安装的PaddlePaddle版本" + echo + checkLinuxPaddleVersion + echo + echo "Step 5. 检测pip版本" + echo + checkLinuxPip + echo + checkLinuxAVX + echo "*********************2. 开始安装*****************************" + PipLinuxInstall +} + +function checkMacPython2(){ + while true + do + read -p " + => 未能在常规路径下找到Python2,请使用ctrl+c命令退出安装程序,并使用brew或pypi.org下载安装Python2(注意Python版本不能低于2.7.15) + 如希望自定义Python路径,请输入路径:" python_root + echo + python_version=`$python_root --version 2>&1 1>&1` + if [ $? == "0" ];then + : + else + python_version="" + fi + check_python=`echo $python_version | grep "Python 2"` + if [ "$python_version" == "" ] || [ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ] ;then + python_version="" + elif [ -n "$check_python" ];then + while true + do + read -p " + => 在您的环境中找到 $python_version, 确认使用此版本请输入y;如您希望自定义Python路径请输入n。请在这里输入(y/n)并回车: " use_python + echo + use_python=`echo $use_python | tr 'A-Z' 'a-z'` + if [ "$use_python" == "y" ]||[ "$use_python" == "" ];then + use_python="y" + break + elif [ "$use_python" == "n" ];then + python_root="" + break + else + echo "输入错误,请重新输入(y/n)" + fi + done + if [ "$use_python" == "y" ];then + break + fi + else + echo "您输入Python的不是Python2" + python_version="" + fi + done +} + +function checkMacPython3(){ + while true + do + read -p " + => 未能在常规路径下找到Python3,请使用ctrl+c命令退出安装程序,并使用brew或pypi.org下载Python3 + 如希望自定义Python路径,请输入路径:" python_root + python_version=`$python_root --version 2>&1 1>&1` + if [ $? 
== "0" ];then + : + else + python_version="" + fi + check_python=`echo $python_version | grep "Python 3"` + if [ "$python_version" == "" ] || [ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ] ;then + python_version="" + elif [ -n "$check_python" ] ;then + while true + do + read -p " + => 在您的环境中找到 $python_version, 确认使用此版本请输入y;如您希望自定义Python路径请输入n。请在这里输入(y/n)并回车: " use_python + echo + use_python=`echo $use_python | tr 'A-Z' 'a-z'` + if [ "$use_python" == "y" ]||[ "$use_python" == "" ];then + use_python="y" + break + elif [ "$use_python" == "n" ];then + python_root="" + break + else + echo "输入错误,请重新输入(y/n)" + fi + done + if [ "$use_python" == "y" ];then + break + fi + else + echo "您输入Python的不是Python3" + python_version="" + fi + done +} + +function checkMacPaddleVersion(){ + while true + do + read -n1 -p "Step 2. 选择PaddlePaddle的版本,请按回车键继续..." + echo + read -p " + 1. 开发版:对应Github上develop分支,如您需要开发、或希望使用PaddlePaddle最新功能,请选用此版本 + 2. 稳定版(推荐):如您无特殊开发需求,建议使用此版本,目前最新的版本号为 ${release_version} + + => 请输入数字1或2。如输入其他字符或直接回车,将会默认选择【 2. 稳定版 】 。请在这里输入并回车:" paddle_version + if [ "$paddle_version" == "1" ]||[ "$paddle_version" == "2" ];then + echo + echo "您选择了数字【"$paddle_version" 】" + echo + break + else + paddle_version="2" + echo + echo "您选择了数字【2】" + echo + break + fi + done +} + +function checkMacPythonVersion(){ + while true + do + read -n1 -p "Step 3. 选择Python版本,请按回车键继续..." + read -p " + 2. 使用python 2.x + 3. 使用python 3.x + + => 请输入数字2或3。如输入其他字符或直接回车,将会默认使用【Python 2 】。请在这里输入并回车:" python_V + echo + if [ "$python_V" == "" ];then + python_V="2" + fi + echo "您选择了数字【"$python_V"】,正在寻找符合您要求的Python版本,请按回车键继续..." + echo + if [ "$python_V" == "2" ];then + python_root=`which python2.7` + if [ "$python_root" == "" ];then + python_root=`which python` + fi + python_version=`$python_root --version 2>&1 1>&1` + if [ $? == "0" ];then + : + else + python_version="" + fi + if [ "$python_root" == "" ]||[ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ]||[ "$python_root" == "/usr/bin/python2.7" -a "$python_version" == "Python 2.7.10" ];then + checkMacPython2 + fi + while true + do + read -p " + => 在您的环境中找到 $python_version, 确认使用此版本请输入y;如您希望自定义Python路径请输入n。请在这里输入(y/n)并回车:" use_python + echo + use_python=`echo $use_python | tr 'A-Z' 'a-z'` + if [ "$use_python" == "y" ]||[ "$use_python" == "" ];then + break + elif [ "$use_python" == "n" ];then + python_root="" + checkMacPython2 + break + else + echo "输入错误,请重新输入(y/n)" + fi + done + + elif [ "$python_V" == "3" ];then + python_root=`which python3` + python_version=`$python_root --version 2>&1 1>&1` + if [ $? 
== "0" ];then + : + else + python_version="" + fi + if [ "$python_root" == "" ]||[ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ];then + checkMacPython3 + fi + while true + do + read -p " + => 在您的环境中找到 $python_version, 确认使用此版本请输入y;如您希望自定义Python路径请输入n。请在这里输入(y/n)并回车:" use_python + echo + use_python=`echo $use_python | tr 'A-Z' 'a-z'` + if [ "$use_python" == "y" ]||[ "$use_python" == "" ];then + break + elif [ "$use_python" == "n" ];then + checkMacPython3 + break + else + echo "输入错误,请重新输入(y/n)" + fi + done + else + : + fi + + + if [ "$python_V" == "2" ]||[ "$python_V" == "3" ];then + python_brief_version=`$python_root -m pip -V |awk -F "[ |)]" '{print $6}'|sed 's#\.##g'` + if [[ $python_brief_version == "27" ]];then + uncode=`python -c "import pip._internal;print(pip._internal.pep425tags.get_supported())"|grep "cp27"` + if [[ $uncode == "" ]];then + uncode="mu" + else + uncode="m" + fi + fi + version_list=`echo "${python_list[@]}" | grep "$python_brief_version" ` + if [ "$version_list" != "" ];then + break + else + echo "未找到可用的pip或pip3。PaddlePaddle目前支持:Python2.7/3.5/3.6/3.7及其对应的pip, 请重新输入,或使用ctrl + c退出" + fi + else + echo "输入错误,请重新输入" + fi + done +} + +function checkMacAVX(){ + read -n1 -p "Step 4. 检测您的Mac是否支持AVX指令集,请按回车键继续..." + echo + if [[ $AVX != "" ]];then + AVX="avx" + echo "检测结果:支持" + else + read -n1 -p "检测结果:不支持。非常抱歉,PaddlePaddle在Mac系统暂不提供no_avx类型的安装包,您可以选择在Linux系统中安装no_avx版的PaddlePaddle, 请按回车键退出..." + exit + fi + echo +} + +function checkMacGPU(){ + read -n1 -p "Step 5. 选择CPU/GPU版本,请按回车键继续..." + echo + if [[ $GPU != "" ]];then + echo "MacOS环境下,暂未提供GPU版本的PaddlePaddle安装包,将为您安装CPU版本的PaddlePaddle" + else + echo "MacOS环境下,暂未提供GPU版本的PaddlePaddle安装包,将为您安装CPU版本的PaddlePaddle" + GPU=cpu + fi + echo +} + +function macos() { + path='http://paddlepaddle.org/download?url=' + AVX=`sysctl -a | grep cpu | grep AVX1.0 | tail -1 | grep AVX` + + while true + do + checkMacPaddleVersion + checkMacPythonVersion + checkMacAVX + checkMacGPU + + + echo "*********************2. 开始安装*****************************" + echo + read -n1 -p "即将为您下载并安装PaddlePaddle,请按回车键继续..." + echo + if [[ $paddle_version == "2" ]];then + $python_root -m pip install paddlepaddle + if [ $? == "0" ];then + echo "安装成功,可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器" + break + else + rm $whl_cpu_release + echo "未能正常安装PaddlePaddle,请尝试更换您输入的python路径,或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用" + echo"" + echo "==========================================================================================" + echo"" + exit 1 + fi + else + if [ -f $whl_cpu_develop ];then + $python_root -m pip install $whl_cpu_develop + if [ $? == "0" ];then + rm -rf $whl_cpu_develop + echo "安装成功!小提示:可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器" + break + else + echo "未能正常安装PaddlePaddle,请尝试更换您输入的python路径,或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用" + echo"" + echo "==========================================================================================" + echo"" + exit 1 + fi + else + wget ${path}$whl_cpu_develop -O $whl_cpu_develop + if [ $? == "0" ];then + $python_root -m pip install $whl_cpu_develop + if [ $? 
== "0" ];then + rm $wheel_cpu_develop + echo "安装成功,可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器" + break + else + rm $whl_cpu_release + echo "未能正常安装PaddlePaddle,请尝试更换您输入的python路径,或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用" + echo"" + echo "==========================================================================================" + echo"" + exit 1 + fi + else + rm $whl_cpu_develop + echo "未能正常安装PaddlePaddle,请检查您的网络 或者确认您是否安装有 wget,或者ctrl + c退出后反馈至https://github.com/PaddlePaddle/Paddle/issues" + echo"" + echo "==========================================================================================" + echo"" + exit 1 + fi + fi + fi + done +} + +function main() { + echo "*********************************" + echo "欢迎使用PaddlePaddle快速安装脚本" + echo "*********************************" + echo + echo "如果您在安装过程中遇到任何问题,请在https://github.com/PaddlePaddle/Paddle/issues反馈,我们的工作人员将会帮您答疑解惑" + echo + echo "本安装包将帮助您在Linux或Mac系统下安装PaddlePaddle,包括 1)安装前的准备和 2)开始安装 两部分" + echo + read -n1 -p "请按回车键进行下一步..." + echo + echo + echo "*********************1. 安装前的准备*****************************" + echo + echo "Step 1. 正在检测您的操作系统信息..." + echo + SYSTEM=`uname -s` + if [ "$SYSTEM" == "Darwin" ];then + echo "您的系统为:MAC OSX" + echo + macos + else + echo "您的系统为:Linux" + echo + OS=`cat /etc/issue|awk 'NR==1 {print $1}'` + if [ $OS == "\S" ] || [ "$OS" == "CentOS" ] || [ $OS == "Ubuntu" ];then + linux + else + echo "您的系统不在本安装包的支持范围,如您需要在windows环境下安装PaddlePaddle,请您参考PaddlePaddle官网的windows安装文档" + fi + fi +} +main diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 59e695e6fcb66cbaed1bcc9e861df81b5f73c1ed..90b8fd1a0aab159eb1a829d67485c845182d295b 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -54,7 +54,7 @@ ELSE(WIN32) DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) ENDIF() -set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS}) +set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS} ${external_project_dependencies}) add_custom_target(paddle_python ALL DEPENDS ${paddle_python_deps}) set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 896d98c97f961d7923b009833be2a531412ddd77..6fa0de847c8d8cda85689714376304807edc4f26 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -158,7 +158,8 @@ def __bootstrap__(): 'enable_cublas_tensor_op_math', 'conv_workspace_size_limit', 'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus', 'sync_nccl_allreduce', 'limit_of_tmp_allocation', - 'times_excess_than_required_tmp_allocation' + 'times_excess_than_required_tmp_allocation', + 'enable_inplace_whitelist' ] core.init_gflags([sys.argv[0]] + diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py index a35a4c59835e2a64a11ae156bed34d4b35696f73..ef0242942838fcca737a10fafbafa61bf520b532 100644 --- a/python/paddle/fluid/compiler.py +++ b/python/paddle/fluid/compiler.py @@ -174,6 +174,11 @@ class CompiledProgram(object): self._exec_strategy.num_threads = cpu_num * 2 trainers_endpoints = self._program._trainers_endpoints + + # FIXME(dzhwinter): enable_inplace should be after memory_optimize + # if turn on python memory optimize, turn off the inplace_pass. 
+        self._build_strategy.enable_inplace = False if self._program._is_mem_optimized else True
+
         if self._build_strategy.num_trainers > 1 and trainers_endpoints:
             assert self._build_strategy.num_trainers == len(
                 trainers_endpoints), "num_trainers == len(end_points)"
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 96587b6e904f681a71182ffdb03608b5edde5e46..c0b0ad8a202b82183de9ec1edd43cb10db10fb5c 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -1725,6 +1725,19 @@ class Program(object):
         self._trainers_endpoints = []
         # the distributed lookup table names
         self._distributed_lookup_table = None
+        # @deprecated(the python memory optimize transpiler is deprecated)
+        # whether the program has been optimized by memory_optimize_transpiler
+        self.__is_mem_optimized = False
+
+    @property
+    def _is_mem_optimized(self):
+        # if the program has been optimized, an operator's inputs and outputs
+        # may alias each other, which conflicts with save_inference_model.
+        return self.__is_mem_optimized
+
+    @_is_mem_optimized.setter
+    def _is_mem_optimized(self, target):
+        self.__is_mem_optimized = target
 
     @property
     def op_role(self):
@@ -1744,7 +1757,7 @@ class Program(object):
         return self._current_role
 
     @op_role.setter
-    def set_op_role(self, role):
+    def op_role(self, role):
         self._current_role = role
 
     @property
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index 6b1d4cc34f3cd40c878740f28618f26d5e89a6bd..a2abbf36c0267d85c9c97af00c9faabf1187822c 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -16,14 +16,16 @@ from __future__ import print_function
 
 import os
 import errno
+import warnings
 import time
 import shutil
 import six
 from functools import reduce
 
+from paddle.fluid import layers
 from paddle.fluid.executor import Executor
 from paddle.fluid.evaluator import Evaluator
-from paddle.fluid.framework import Program, Parameter, default_main_program, default_startup_program, Variable
+from paddle.fluid.framework import Program, Parameter, default_main_program, default_startup_program, Variable, program_guard
 from . import core
 
 __all__ = [
@@ -930,6 +932,24 @@ def save_inference_model(dirname,
 
     if main_program is None:
         main_program = default_main_program()
+    if main_program._is_mem_optimized:
+        warnings.warn(
+            "save_inference_model must be called before memory_optimize, "
+            "because memory_optimize modifies the original program and makes "
+            "it unsuitable for saving an inference model; the original "
+            "program is saved as the inference model instead.",
+            RuntimeWarning)
+
+    # Fix the bug that an activation op's output used as a target var would
+    # be pruned; this slightly affects inference performance.
+    # TODO(Superjomn) add an IR pass to remove the 1-scale op.
+    with program_guard(main_program):
+        uniq_target_vars = []
+        for var in target_vars:
+            if isinstance(var, Variable):
+                var = layers.scale(var, 1.)
+            uniq_target_vars.append(var)
+        target_vars = uniq_target_vars
 
     # when a pserver and a trainer running on the same machine, mkdir may conflict
     try:
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 0602d7a19481fbf0210a7cb4bd15a1033b0e8900..3b43ae0b9cb63a9f4708a680cb1021d74c197550 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -49,6 +49,7 @@ __all__ = [
     'box_coder',
     'polygon_box_transform',
     'yolov3_loss',
+    'box_clip',
     'multiclass_nms',
 ]
 
@@ -396,10 +397,10 @@ def box_coder(prior_box,
                              input is image feature map, they are close to
                              the origin of the coordinate system.
                              [xmax, ymax] is the right bottom
                              coordinate of the anchor box.
-        prior_box_var(Variable|list): prior_box_var supports two types of input.
-                              One is variable with shape [M, 4] holds M group.
-                              The other one is list consist of 4 elements
-                              shared by all boxes.
+        prior_box_var(Variable|list|None): prior_box_var supports two types
+                              of input. One is a Variable with shape [M, 4]
+                              that holds M groups of variances. The other is
+                              a list of 4 elements shared by all boxes.
         target_box(Variable): This input can be a 2-D LoDTensor with shape
                               [N, 4] when code_type is 'encode_center_size'.
                               This input also can be a 3-D Tensor with shape
@@ -2055,6 +2056,54 @@ def generate_proposals(scores,
     return rpn_rois, rpn_roi_probs
 
 
+def box_clip(input, im_info, name=None):
+    """
+    Clip the boxes to the size given by im_info.
+    For each input box, the formula is given as follows:
+
+    .. code-block:: text
+
+        xmin = max(min(xmin, im_w - 1), 0)
+        ymin = max(min(ymin, im_h - 1), 0)
+        xmax = max(min(xmax, im_w - 1), 0)
+        ymax = max(min(ymax, im_h - 1), 0)
+
+    where im_w and im_h are computed from im_info:
+
+    .. code-block:: text
+
+        im_h = round(height / scale)
+        im_w = round(width / scale)
+
+    Args:
+        input(Variable): The input boxes, whose last dimension is 4.
+        im_info(Variable): The information of the image with shape [N, 3] and
+                           layout (height, width, scale). height and width
+                           are the input size, and scale is the ratio of the
+                           input size to the original size.
+        name (str): The name of this layer. It is optional.
+
+    Returns:
+        Variable: The clipped tensor variable.
+
+    Examples:
+        .. code-block:: python
+
+            boxes = fluid.layers.data(
+                name='data', shape=[8, 4], dtype='float32', lod_level=1)
+            im_info = fluid.layers.data(name='im_info', shape=[3])
+            out = fluid.layers.box_clip(
+                input=boxes, im_info=im_info)
+    """
+
+    helper = LayerHelper("box_clip", **locals())
+    output = helper.create_variable_for_type_inference(dtype=input.dtype)
+    inputs = {"Input": input, "ImInfo": im_info}
+    helper.append_op(type="box_clip", inputs=inputs, outputs={"Output": output})
+
+    return output
+
+
 def multiclass_nms(bboxes,
                    scores,
                    score_threshold,
@@ -2132,9 +2181,11 @@ def multiclass_nms(bboxes,
             (After version 1.3, when no boxes detected, the lod is changed
              from {0} to {1})
 
+
     Examples:
         .. code-block:: python
 
+
             boxes = fluid.layers.data(name='bboxes', shape=[81, 4],
                                       dtype='float32', lod_level=1)
             scores = fluid.layers.data(name='scores', shape=[81],
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index a07ff6ac69ca20c8c68659a67606076ce8cdf027..52b260efd15066a114a8146106685043654c91ea 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -146,6 +146,9 @@ class ParallelExecutor(object):
         # step4: get main_program, scope, local_scopes
         main = main_program if main_program \
             else framework.default_main_program()
+        # FIXME(dzhwinter): enable_inplace should be set after memory_optimize.
+        # If the Python memory optimizer is turned on, turn off the inplace pass.
+        build_strategy.enable_inplace = False if main._is_mem_optimized else True
 
         scope = scope if scope is not None else executor.global_scope()
 
         if share_vars_from and not isinstance(share_vars_from,
diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py
index 77dfa1cb519db3faa9ef8b7b27f7a39b5d31f2a8..0d39a139eed87f900b1f59fd0569b6acaec0962b 100644
--- a/python/paddle/fluid/tests/test_detection.py
+++ b/python/paddle/fluid/tests/test_detection.py
@@ -482,6 +482,17 @@ class TestYoloDetection(unittest.TestCase):
             self.assertIsNotNone(loss)
 
 
+class TestBoxClip(unittest.TestCase):
+    def test_box_clip(self):
+        program = Program()
+        with program_guard(program):
+            input_box = layers.data(
+                name='input_box', shape=[7, 4], dtype='float32', lod_level=1)
+            im_info = layers.data(name='im_info', shape=[3], dtype='float32')
+            out = layers.box_clip(input_box, im_info)
+            self.assertIsNotNone(out)
+
+
 class TestMulticlassNMS(unittest.TestCase):
     def test_multiclass_nms(self):
         program = Program()
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 699181d01da862dca72113e6c11630ae5693e41c..4b26bacce968a6da72e9aa043adb38918b293a35 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -110,6 +110,10 @@ py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executo
 if(NOT APPLE)
     py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)
 endif()
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    # Raise the timeout from 600 to 900 because this test needs more time in debug mode.
+    set_tests_properties(test_image_classification_resnet PROPERTIES TIMEOUT 900)
+endif()
 
 if (WITH_NGRAPH)
     add_subdirectory(ngraph)
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_accuracy_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_accuracy_ngraph_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..13a33e20478372af370d38ab2b475e4425dc8d6e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_accuracy_ngraph_op.py
@@ -0,0 +1,30 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle.fluid.core as core +from paddle.fluid.tests.unittests.op_test import OpTest +from paddle.fluid.tests.unittests.test_accuracy_op import TestAccuracyOp + + +class TestNGRAPHAccuracyOp(TestAccuracyOp): + def setUp(self): + super(TestNGRAPHAccuracyOp, self).setUp() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_conv2d_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_conv2d_ngraph_op.py new file mode 100644 index 0000000000000000000000000000000000000000..e5424e8a6e615820b4a1a5f2ee7e7e87dd0b22af --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ngraph/test_conv2d_ngraph_op.py @@ -0,0 +1,52 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +from paddle.fluid.tests.unittests.test_conv2d_op import * + + +class TestNGRAPH(TestConv2dOp): + def init_kernel_type(self): + super(TestNGRAPH, self).init_kernel_type() + + +class TestNGRAPHWithPad(TestWithPad): + def init_kernel_type(self): + super(TestNGRAPHWithPad, self).init_kernel_type() + + +class TestNGRAPHWithStride(TestWithStride): + def init_kernel_type(self): + super(TestNGRAPHWithStride, self).init_kernel_type() + + +class TestNGRAPHWithGroup(TestWithGroup): + def init_kernel_type(self): + super(TestNGRAPHWithGroup, self).init_kernel_type() + + +class TestNGRAPHWith1x1(TestWith1x1): + def init_kernel_type(self): + super(TestNGRAPHWith1x1, self).init_kernel_type() + + +class TestNGRAPHWithInput1x1Filter1x1(TestWithInput1x1Filter1x1): + def init_kernel_type(self): + super(TestNGRAPHWithInput1x1Filter1x1, self).init_kernel_type() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index fdacd241f9e1f8d442f55098e2d192a3d57fdaf1..c429c8af7d37cb4e209edc41f704868afe054829 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -40,7 +40,8 @@ class TestParallelExecutorBase(unittest.TestCase): seed=None, use_parallel_executor=True, use_reduce=False, - use_ir_memory_optimize=False, + use_ir_memory_optimize=True, + enable_inplace=True, fuse_elewise_add_act_ops=False, fuse_relu_depthwise_conv=False, optimizer=fluid.optimizer.Adam, @@ -60,63 +61,65 @@ class TestParallelExecutorBase(unittest.TestCase): main.random_seed = seed loss = method(use_feed=feed_dict is not None) - if optimizer: optimizer().minimize(loss) if memory_opt: fluid.memory_optimize(main) - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(startup) - exec_strategy = fluid.ExecutionStrategy() - exec_strategy.allow_op_delay = allow_op_delay - if use_fast_executor: - exec_strategy.use_experimental_executor = 
True
-            build_strategy = fluid.BuildStrategy()
-            build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \
-                if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce
-            build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops
-            build_strategy.fuse_relu_depthwise_conv = fuse_relu_depthwise_conv
-            build_strategy.memory_optimize = use_ir_memory_optimize
-            build_strategy.enable_sequential_execution = enable_sequential_execution
-            if use_cuda and core.is_compiled_with_cuda():
-                build_strategy.remove_unnecessary_lock = True
-            if use_parallel_executor:
-                binary = compiler.CompiledProgram(main).with_data_parallel(
-                    loss_name=loss.name,
-                    build_strategy=build_strategy,
-                    exec_strategy=exec_strategy)
-            else:
-                binary = compiler.CompiledProgram(main)
-
-            if batch_size is not None:
-                batch_size *= fluid.core.get_cuda_device_count(
-                ) if use_cuda else int(
-                    os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-            begin = time.time()
-            first_loss, = run_executor(
-                exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name])
-
-            for i in range(iter):
-                run_executor(
-                    exe=exe, binary=binary, feed=feed_dict, fetch_list=[])
-
-            last_loss, = run_executor(
-                exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name])
-            end = time.time()
-
-            if batch_size is not None:
-                print("%.4f Instance per second" % (
-                    (batch_size * iter + 2) / (end - begin)))
-
-            avg_last_loss_val = np.array(last_loss).mean()
-            avg_first_loss_val = np.array(first_loss).mean()
-            if math.isnan(float(avg_last_loss_val)) or math.isnan(
-                    float(avg_first_loss_val)):
-                sys.exit("got NaN loss, training failed.")
-
-            print(first_loss, last_loss)
-            # self.assertGreater(first_loss[0], last_loss[0])
-            return first_loss, last_loss
+        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(startup)
+        exec_strategy = fluid.ExecutionStrategy()
+        exec_strategy.allow_op_delay = allow_op_delay
+        if use_fast_executor:
+            exec_strategy.use_experimental_executor = True
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \
+            if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce
+        build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops
+        build_strategy.fuse_relu_depthwise_conv = fuse_relu_depthwise_conv
+        build_strategy.memory_optimize = use_ir_memory_optimize
+        # The Python memory optimizer conflicts with the inplace pass. Using
+        # IR graph memory optimization after the inplace pass is the correct way.
+ build_strategy.enable_inplace = False if memory_opt else enable_inplace + build_strategy.enable_sequential_execution = enable_sequential_execution + + if use_cuda and core.is_compiled_with_cuda(): + build_strategy.remove_unnecessary_lock = True + if use_parallel_executor: + binary = compiler.CompiledProgram(main).with_data_parallel( + loss_name=loss.name, + build_strategy=build_strategy, + exec_strategy=exec_strategy) + else: + binary = compiler.CompiledProgram(main) + + if batch_size is not None: + batch_size *= fluid.core.get_cuda_device_count( + ) if use_cuda else int( + os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + begin = time.time() + first_loss, = run_executor( + exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name]) + + for i in range(iter): + run_executor(exe=exe, binary=binary, feed=feed_dict, fetch_list=[]) + + last_loss, = run_executor( + exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name]) + end = time.time() + + if batch_size is not None: + print("%.4f Instance per second" % ( + (batch_size * iter + 2) / (end - begin))) + + avg_last_loss_val = np.array(last_loss).mean() + avg_first_loss_val = np.array(first_loss).mean() + if math.isnan(float(avg_last_loss_val)) or math.isnan( + float(avg_first_loss_val)): + sys.exit("got NaN loss, training failed.") + + print(first_loss, last_loss) + # self.assertGreater(first_loss[0], last_loss[0]) + return first_loss, last_loss diff --git a/python/paddle/fluid/tests/unittests/test_box_clip_op.py b/python/paddle/fluid/tests/unittests/test_box_clip_op.py new file mode 100644 index 0000000000000000000000000000000000000000..b2b0598f31dd27e12e5ce329129129b5e0f1caf0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_box_clip_op.py @@ -0,0 +1,70 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +import sys +import math +from op_test import OpTest +import copy + + +def box_clip(input_box, im_info, output_box): + im_w = round(im_info[1] / im_info[2]) + im_h = round(im_info[0] / im_info[2]) + output_box[:, :, 0] = np.maximum( + np.minimum(input_box[:, :, 0], im_w - 1), 0) + output_box[:, :, 1] = np.maximum( + np.minimum(input_box[:, :, 1], im_h - 1), 0) + output_box[:, :, 2] = np.maximum( + np.minimum(input_box[:, :, 2], im_w - 1), 0) + output_box[:, :, 3] = np.maximum( + np.minimum(input_box[:, :, 3], im_h - 1), 0) + + +def batch_box_clip(input_boxes, im_info, lod): + n = input_boxes.shape[0] + m = input_boxes.shape[1] + output_boxes = np.zeros((n, m, 4), dtype=np.float32) + cur_offset = 0 + for i in range(len(lod)): + box_clip(input_boxes[cur_offset:(cur_offset + lod[i]), :, :], + im_info[i, :], + output_boxes[cur_offset:(cur_offset + lod[i]), :, :]) + cur_offset += lod[i] + return output_boxes + + +class TestBoxClipOp(OpTest): + def test_check_output(self): + self.check_output() + + def setUp(self): + self.op_type = "box_clip" + lod = [[1, 2, 3]] + input_boxes = np.random.random((6, 10, 4)) * 5 + im_info = np.array([[5, 8, 1.], [6, 6, 1.], [7, 5, 1.]]) + output_boxes = batch_box_clip(input_boxes, im_info, lod[0]) + + self.inputs = { + 'Input': (input_boxes.astype('float32'), lod), + 'ImInfo': im_info.astype('float32'), + } + self.outputs = {'Output': output_boxes} + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_box_coder_op.py b/python/paddle/fluid/tests/unittests/test_box_coder_op.py index 6156268bf25ada310a3d22242ecff4b9cdf1759a..220bffebe83925c60af65aa9594ddd8a29c38145 100644 --- a/python/paddle/fluid/tests/unittests/test_box_coder_op.py +++ b/python/paddle/fluid/tests/unittests/test_box_coder_op.py @@ -34,7 +34,9 @@ def box_decoder(t_box, p_box, pb_v, output_box, norm, axis=0): pb_y = pb_y.reshape(shape) if pb_v.ndim == 2: - pb_v = pb_v.reshape(1, pb_v.shape[0], pb_v.shape[1]) + var_shape = (1, pb_v.shape[0], pb_v.shape[1]) if axis == 0 else ( + pb_v.shape[0], 1, pb_v.shape[1]) + pb_v = pb_v.reshape(var_shape) if pb_v.ndim == 1: tb_x = pb_v[0] * t_box[:, :, 0] * pb_w + pb_x tb_y = pb_v[1] * t_box[:, :, 1] * pb_h + pb_y @@ -125,33 +127,6 @@ class TestBoxCoderOp(OpTest): self.outputs = {'OutputBox': output_box} -class TestBoxCoderOpWithOneRankVar(OpTest): - def test_check_output(self): - self.check_output() - - def setUp(self): - self.op_type = "box_coder" - lod = [[1, 1, 1, 1, 1]] - prior_box = np.random.random((81, 4)).astype('float32') - prior_box_var = np.random.random((4)).astype('float32') - target_box = np.random.random((20, 81, 4)).astype('float32') - code_type = "DecodeCenterSize" - box_normalized = False - output_box = batch_box_coder(prior_box, prior_box_var, target_box, - lod[0], code_type, box_normalized) - - self.inputs = { - 'PriorBox': prior_box, - 'PriorBoxVar': prior_box_var, - 'TargetBox': target_box, - } - self.attrs = { - 'code_type': 'decode_center_size', - 'box_normalized': False - } - self.outputs = {'OutputBox': output_box} - - class TestBoxCoderOpWithoutBoxVar(OpTest): def test_check_output(self): self.check_output() @@ -210,7 +185,7 @@ class TestBoxCoderOpWithAxis(OpTest): self.op_type = "box_coder" lod = [[1, 1, 1, 1, 1]] prior_box = np.random.random((30, 4)).astype('float32') - prior_box_var = np.random.random((4)).astype('float32') + prior_box_var = np.random.random((30, 4)).astype('float32') target_box = 
np.random.random((30, 81, 4)).astype('float32') code_type = "DecodeCenterSize" box_normalized = False diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py index 754d5fd40953311a5deb466fa42216f72671a65a..603c8e74885d2a050e6e1e3101dce880b6eabe9c 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py @@ -16,12 +16,10 @@ import os import unittest os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" -from test_parallel_executor_transformer import TestTransformer - - -class EagerDeletionTestTransformer(TestTransformer): - pass +os.environ[ + 'RECORDIO_FILENAME'] = '/tmp/eager_deletion_transformer.wmt16.recordio' +from test_parallel_executor_transformer import TestTransformer if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_inference_model_io.py b/python/paddle/fluid/tests/unittests/test_inference_model_io.py index 9962702f69644b7aef7d868f086abb390441f617..9c9f86330704466c7a8801af6ab0fb2bba23f931 100644 --- a/python/paddle/fluid/tests/unittests/test_inference_model_io.py +++ b/python/paddle/fluid/tests/unittests/test_inference_model_io.py @@ -25,6 +25,7 @@ import paddle.fluid.layers as layers import paddle.fluid.optimizer as optimizer from paddle.fluid.framework import Program, program_guard from paddle.fluid.io import save_inference_model, load_inference_model +from paddle.fluid.transpiler import memory_optimize class TestBook(unittest.TestCase): @@ -82,9 +83,36 @@ class TestBook(unittest.TestCase): self.assertEqual(feed_var_names, ["x", "y"]) self.assertEqual(len(fetch_vars), 1) - self.assertEqual(str(fetch_vars[0]), str(avg_cost)) + print("fetch %s" % str(fetch_vars[0])) + self.assertTrue("scale" in str(fetch_vars[0])) self.assertEqual(expected, actual) +class TestSaveInferenceModel(unittest.TestCase): + def test_save_inference_model(self): + MODEL_DIR = "./tmp/inference_model2" + init_program = Program() + program = Program() + + # fake program without feed/fetch + with program_guard(program, init_program): + x = layers.data(name='x', shape=[2], dtype='float32') + y = layers.data(name='y', shape=[1], dtype='float32') + + y_predict = layers.fc(input=x, size=1, act=None) + + cost = layers.square_error_cost(input=y_predict, label=y) + avg_cost = layers.mean(cost) + + place = core.CPUPlace() + exe = executor.Executor(place) + exe.run(init_program, feed={}, fetch_list=[]) + + memory_optimize(program, print_log=True) + self.assertEqual(program._is_mem_optimized, True) + # will print warning message + save_inference_model(MODEL_DIR, ["x", "y"], [avg_cost], exe, program) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py b/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..4e196758efc990506957089fb5b88ebb099cca29 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py @@ -0,0 +1,76 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+from parallel_executor_test_base import TestParallelExecutorBase
+
+
+def fc_with_batchnorm(use_feed):
+    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    hidden = img
+    for _ in range(3):
+        hidden = fluid.layers.fc(
+            hidden,
+            size=200,
+            act='tanh',
+            bias_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=1.0)))
+
+        hidden = fluid.layers.batch_norm(input=hidden)
+    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+class TestIrInplace(TestParallelExecutorBase):
+    @classmethod
+    def setUpClass(cls):
+        os.environ['CPU_NUM'] = str(4)
+
+    def _fc_with_batchnorm(self,
+                           ir_memory_optimize,
+                           enable_inplace,
+                           memory_opt=False):
+
+        if not core.is_compiled_with_cuda():
+            return
+        np.random.seed(5)
+        img = np.random.random(size=[32, 784]).astype(np.float32)
+        label = np.ones(shape=[32, 1], dtype='int64')
+        first_loss, last_loss = self.check_network_convergence(
+            fc_with_batchnorm,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=True,
+            memory_opt=memory_opt,
+            use_ir_memory_optimize=ir_memory_optimize,
+            enable_inplace=enable_inplace)
+        # Return a scalar loss so the configurations below can actually be
+        # compared; without this return, every assertAlmostEqual would
+        # trivially compare None with None.
+        return np.mean(last_loss)
+
+    def test_fc_with_batchnorm(self, delta=1e-3):
+        loss00 = self._fc_with_batchnorm(False, False)
+        loss10 = self._fc_with_batchnorm(True, False)
+        loss01 = self._fc_with_batchnorm(False, True)
+        loss11 = self._fc_with_batchnorm(True, True)
+        self.assertAlmostEqual(loss00, loss10, delta=delta)
+        self.assertAlmostEqual(loss00, loss01, delta=delta)
+        self.assertAlmostEqual(loss00, loss11, delta=delta)
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
index e7a56bb6386a812e43e5c1b5c08cd0682aa9223a..9548598d75367ed1f1863d1f6ae50b83d58f8c7f 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
@@ -200,7 +200,7 @@ class TestResnet(TestParallelExecutorBase):
                            model,
                            use_cuda,
                            iter=20,
-                           delta2=1e-6):
+                           delta2=1e-5):
         if use_cuda and not core.is_compiled_with_cuda():
             return
 
@@ -228,7 +228,7 @@ class TestResnet(TestParallelExecutorBase):
             optimizer=optimizer)
 
         for loss in zip(all_reduce_first_loss, reduce_first_loss):
-            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
+            self.assertAlmostEquals(loss[0], loss[1], delta=1e-5)
         for loss in zip(all_reduce_last_loss, reduce_last_loss):
             self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
 
@@ -258,17 +258,17 @@ class TestResnet(TestParallelExecutorBase):
             enable_sequential_execution=True)
 
         for loss in zip(all_reduce_first_loss, all_reduce_first_loss_seq):
-            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
+            self.assertAlmostEquals(loss[0], loss[1], delta=1e-5)
         for loss in zip(all_reduce_last_loss, all_reduce_last_loss_seq):
self.assertAlmostEquals(loss[0], loss[1], delta=delta2) for loss in zip(reduce_first_loss, reduce_first_loss_seq): - self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) + self.assertAlmostEquals(loss[0], loss[1], delta=1e-5) for loss in zip(reduce_last_loss, reduce_last_loss_seq): self.assertAlmostEquals(loss[0], loss[1], delta=delta2) for loss in zip(all_reduce_first_loss_seq, reduce_first_loss_seq): - self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) + self.assertAlmostEquals(loss[0], loss[1], delta=1e-5) for loss in zip(all_reduce_last_loss_seq, reduce_last_loss_seq): self.assertAlmostEquals(loss[0], loss[1], delta=delta2) @@ -277,7 +277,7 @@ class TestResnet(TestParallelExecutorBase): use_cuda=True, use_reduce=False, iter=20, - delta2=1e-6): + delta2=1e-5): if use_cuda and not core.is_compiled_with_cuda(): return @@ -308,7 +308,7 @@ class TestResnet(TestParallelExecutorBase): optimizer=optimizer) self.assertAlmostEquals( - np.mean(parallel_first_loss), single_first_loss[0], delta=1e-6) + np.mean(parallel_first_loss), single_first_loss[0], delta=1e-5) self.assertAlmostEquals( np.mean(parallel_last_loss), single_last_loss[0], delta=delta2) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py index 3827743908c1d76931572277323d1dd5ddd05523..aacc1c3ecda8c25dec9f08827a856d38c37b1b2f 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py @@ -24,7 +24,7 @@ import paddle.fluid.core as core import paddle.dataset.wmt16 as wmt16 import os -WMT16_RECORDIO_FILE = "/tmp/wmt16.recordio" +WMT16_RECORDIO_FILE = os.environ.get('RECORDIO_FILENAME', '/tmp/wmt16.recordio') class ModelHyperParams(object): diff --git a/python/paddle/fluid/tests/unittests/test_peak_gpumem_monitor.py b/python/paddle/fluid/tests/unittests/test_peak_gpumem_monitor.py new file mode 100644 index 0000000000000000000000000000000000000000..3673fd10c4d14ba8d7a9644dcc024f58cf92a099 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_peak_gpumem_monitor.py @@ -0,0 +1,59 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from __future__ import print_function
+
+import unittest
+import os
+os.environ['FLAGS_benchmark'] = 'True'
+
+import numpy
+import paddle.fluid.core as core
+from paddle.fluid.executor import Executor
+from paddle.fluid.layers import mul, data
+
+
+class TestPeakMemoryMonitoring(unittest.TestCase):
+    def test_mul(self):
+
+        a = data(name='a', shape=[784], dtype='float32')
+        b = data(
+            name='b',
+            shape=[784, 100],
+            dtype='float32',
+            append_batch_size=False)
+        out = mul(x=a, y=b)
+
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+
+            a_np = numpy.random.random((100, 784)).astype('float32')
+            b_np = numpy.random.random((784, 100)).astype('float32')
+            self.assertEqual(0, core.get_mem_usage(0))
+            exe = Executor(place)
+            outs = exe.run(feed={'a': a_np, 'b': b_np}, fetch_list=[out])
+            out = outs[0]
+            # disable this assert since ctest will ignore the os.environ setting
+            # self.assertGreater(core.get_mem_usage(0), 0)
+
+            raised = False
+            try:
+                core.print_mem_usage()
+            except Exception:
+                raised = True
+            self.assertFalse(raised, 'Exception raised')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/transformer_model.py b/python/paddle/fluid/tests/unittests/transformer_model.py
index 143d187edc3a154418f9e639b7d492c8ce994d42..905b7d6fe75ab0080e3e97fbd4710ad913a05a38 100644
--- a/python/paddle/fluid/tests/unittests/transformer_model.py
+++ b/python/paddle/fluid/tests/unittests/transformer_model.py
@@ -17,6 +17,7 @@ from __future__ import print_function
 from functools import partial
 import numpy as np
+import os
 
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 from paddle.fluid.layers.io import open_recordio_file
@@ -408,7 +409,7 @@ def transformer(
         trg_pad_idx,
         pos_pad_idx, ):
     file_obj = open_recordio_file(
-        filename='/tmp/wmt16.recordio',
+        filename=os.environ.get('RECORDIO_FILENAME', '/tmp/wmt16.recordio'),
         shapes=[
             [batch_size * max_length, 1],
             [batch_size * max_length, 1],
diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
index e5d48d3d19ed71624d528144f13e23770a09362a..52c1aea288fa2bb7478ad14186367900c05f64e7 100755
--- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
@@ -540,6 +540,7 @@ def memory_optimize(input_program,
     if skip_opt_set is not None:
         skip_opt_set = set(map(to_name_str, skip_opt_set))
     cfgs = _get_cfgs(input_program)
+    input_program._is_mem_optimized = True
     for cfg in cfgs:
         cfg.memory_optimize(skip_opt_set=skip_opt_set, level=level)
 
@@ -559,5 +560,6 @@ def release_memory(input_program, skip_opt_set=None):
         None
     """
     cfgs = _get_cfgs(input_program)
+    input_program._is_mem_optimized = True
     for cfg in cfgs:
         cfg.release_memory(skip_opt_set=skip_opt_set)