diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e718b32cb6c48d11e73600509a17db107f438708..d8112837dc9627bc2e501940b8e97c89e97c45ff 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -42,12 +42,6 @@ repos: entry: bash ./tools/codestyle/pylint_pre_commit.hook language: system files: \.(py)$ -- repo: https://github.com/PaddlePaddle/pre-commit-golang - sha: 8337620115c25ff8333f1b1a493bd031049bd7c0 - hooks: - - id: go-fmt - types: - - go - repo: local hooks: - id: copyright_checker diff --git a/AUTHORS.md b/AUTHORS.md index deafa641203ed9d9bd794fe92e4a91e3aaa03f63..da91933f4697f3b7c08feab20d703e22397b3757 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -44,6 +44,7 @@ | qingqing01 | Qing-Qing Dang | | reyoung | Yang Yu | | Sand3r- | Michal Gallus | +| sfraczek | Sylwester Fraczek | | Superjom | Chun-Wei Yan | | tensor-tang | Jian Tang | | tianbingsz | Tian-Bing Xu | @@ -54,6 +55,7 @@ | wangyang59 | Yang Wang | | wangzhen-nlp | Zhen Wang | | wen-bo-yang | Wen-Bo Yang | +| wojtuss | Wojciech Uss | | wwhu | Wei-Wei Hu | | xinghai-sun | Xing-Hai Sun | | Xreki | Yi-Qun Liu | diff --git a/CMakeLists.txt b/CMakeLists.txt index 61f5e63098c40f140774ba6bfd9a2de8d2d67bfb..8e7ffe72b5fb846fb55ab8dc4809d87a40cfe06c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,23 +54,12 @@ option(WITH_NGRAPH "Compile PaddlePaddle with nGraph support." OFF) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF) option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON) -option(WITH_DOUBLE "Compile PaddlePaddle with double precision" OFF) -option(WITH_RDMA "Compile PaddlePaddle with RDMA support" OFF) -option(WITH_TIMER "Compile PaddlePaddle with stats timer" OFF) option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" OFF) option(WITH_JEMALLOC "Compile PaddlePaddle with jemalloc" OFF) -option(WITH_DOC "Compile PaddlePaddle with documentation" OFF) option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF) -option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF) -option(WITH_FLUID_ONLY "Compile PaddlePaddle fluid only" OFF) -option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF) -option(GLIDE_INSTALL "Download and install go dependencies " ON) option(WITH_DISTRIBUTE "Compile with distributed support" OFF) option(WITH_PSLIB "Compile with pslib support" OFF) -option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF) -option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF) -option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF) option(WITH_CONTRIB "Compile the third-party contributation" OFF) option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF) option(WITH_ANAKIN "Compile with Anakin library" OFF) @@ -105,8 +94,6 @@ endif() if (WIN32) set(WITH_DISTRIBUTE OFF CACHE STRING "Disable DISTRIBUTE when compiling for Windows" FORCE) - set(WITH_FLUID_ONLY ON CACHE STRING - "Enable FLUID_ONLY when compiling for Windows" FORCE) endif() set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING @@ -148,7 +135,6 @@ include(external/openblas) # download, build, install openblas include(external/mkldnn) # download, build, install mkldnn include(external/ngraph) # download, build, install nGraph include(external/boost) # download boost -include(external/any) # download libn::any include(external/eigen) # download eigen3 include(external/pybind11) # download pybind11 include(external/cares) @@ -225,7 +211,6 @@ include(generic) # simplify cmake module include(package) # set paddle packages include(ccache) # set ccache for compilation include(util) # set unittest and link libs -include(rdma) # set rdma libraries include(version) # set PADDLE_VERSION include(coveralls) # set code coverage include(inference_lib) # add paddle fluid inference libraries @@ -233,38 +218,11 @@ include(inference_lib) # add paddle fluid inference libraries include_directories("${PADDLE_SOURCE_DIR}") -set(EXTERNAL_LIBS - gflags - glog - ${CBLAS_LIBRARIES} - protobuf - zlib - ${PYTHON_LIBRARIES} -) - -if(WITH_PSLIB) - list(APPEND EXTERNAL_LIBS pslib) - list(APPEND EXTERNAL_LIBS pslib_brpc) - list(APPEND EXTERNAL_LIBS libmct) -endif(WITH_PSLIB) - if(WITH_AMD_GPU) find_package(HIP) include(hip) endif(WITH_AMD_GPU) -if(WITH_MKLML) - list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB}) -endif() - -if(WITH_LIBXSMM) - list(APPEND EXTERNAL_LIBS ${LIBXSMM_LIBS}) -endif() - -if(WITH_MKLDNN) - list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB}) -endif() - set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") diff --git a/Dockerfile b/Dockerfile index fe0721e9b99b5e028df2f6228ff04cb56a567a3f..c248ac119caa1f493e4866b02551eb900d3bf391 100644 --- a/Dockerfile +++ b/Dockerfile @@ -75,8 +75,9 @@ RUN curl -s -q https://glide.sh/get | sh # and its size is only one-third of the official one. # 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle. # See https://github.com/PaddlePaddle/Paddle/issues/10129 for details. -RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \ - tar -xz -C /usr/local && \ + +RUN wget -q https://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda.8.0.cudnn7.0.tar.gz --no-check-certificate && \ + tar -zxf TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda.8.0.cudnn7.0.tar.gz -C /usr/local && \ cp -rf /usr/local/TensorRT/include /usr && \ cp -rf /usr/local/TensorRT/lib /usr diff --git a/README.md b/README.md index 68421cf177f4cd15f8f44e8d00a27cafb5a13b91..5c428e9900762a208eebbfd053ce98663f803345 100644 --- a/README.md +++ b/README.md @@ -3,8 +3,8 @@ English | [简体中文](./README_cn.md) [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle) -[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) -[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) +[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html) +[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/index.html) [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) @@ -18,7 +18,7 @@ learning to many products at Baidu. Our vision is to enable deep learning for everyone via PaddlePaddle. Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle. -### Latest PaddlePaddle Release: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2) +### Latest PaddlePaddle Release: [Fluid 1.3.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.3) ### Install Latest Stable Release: ``` # Linux CPU @@ -26,9 +26,9 @@ pip install paddlepaddle # Linux GPU cuda9cudnn7 pip install paddlepaddle-gpu # Linux GPU cuda8cudnn7 -pip install paddlepaddle-gpu==1.2.0.post87 +pip install paddlepaddle-gpu==1.3.0.post87 # Linux GPU cuda8cudnn5 -pip install paddlepaddle-gpu==1.2.0.post85 +pip install paddlepaddle-gpu==1.3.0.post85 # For installation on other platform, refer to http://paddlepaddle.org/ ``` @@ -75,26 +75,26 @@ pip install paddlepaddle-gpu==1.2.0.post85 ## Installation -It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) on our website. +It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html) on our website. ## Documentation -We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) and -[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) documentation. +We provide [English](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html) and +[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/index.html) documentation. - [Deep Learning 101](https://github.com/PaddlePaddle/book) You might want to start from this online interactive book that can run in a Jupyter Notebook. -- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html) +- [Distributed Training](http://paddlepaddle.org/documentation/docs/en/1.3/user_guides/howto/training/multi_node_en.html) You can run distributed training jobs on MPI clusters. -- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html) +- [Python API](http://paddlepaddle.org/documentation/docs/en/1.3/api/index_en.html) Our new API enables much shorter programs. -- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html) +- [How to Contribute](http://paddlepaddle.org/documentation/docs/en/1.3/advanced_usage/development/contribute_to_paddle/index_en.html) We appreciate your contributions! diff --git a/README_cn.md b/README_cn.md index dfb55b17ca4fd05ce5b7b85b2e26e4f7f7229763..b7b0e75e5524cc483a8c203a382e7f339f91694f 100644 --- a/README_cn.md +++ b/README_cn.md @@ -3,8 +3,8 @@ [English](./README.md) | 简体中文 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle) -[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) -[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) +[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html) +[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/index.html) [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) @@ -16,7 +16,7 @@ PaddlePaddle (PArallel Distributed Deep LEarning) 是一个简单易用、高效 跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases) -### PaddlePaddle最新版本: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2) +### PaddlePaddle最新版本: [Fluid 1.3.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.3) ### 安装最新稳定版本: ``` # Linux CPU @@ -24,9 +24,9 @@ pip install paddlepaddle # Linux GPU cuda9cudnn7 pip install paddlepaddle-gpu # Linux GPU cuda8cudnn7 -pip install paddlepaddle-gpu==1.2.0.post87 +pip install paddlepaddle-gpu==1.3.0.post87 # Linux GPU cuda8cudnn5 -pip install paddlepaddle-gpu==1.2.0.post85 +pip install paddlepaddle-gpu==1.3.0.post85 # 其他平台上的安装指引请参考 http://paddlepaddle.org/ ``` @@ -57,26 +57,26 @@ pip install paddlepaddle-gpu==1.2.0.post85 ## 安装 -推荐阅读官网上的[安装说明](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) +推荐阅读官网上的[安装说明](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/install/index_cn.html) ## 文档 -我们提供[英文](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html)和 -[中文](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) 文档 +我们提供[英文](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html)和 +[中文](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/index.html) 文档 - [深度学习101](https://github.com/PaddlePaddle/book) 或许您想从这个在线交互式书籍开始,可以在Jupyter Notebook中运行 -- [分布式训练](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html) +- [分布式训练](http://paddlepaddle.org/documentation/docs/zh/1.3/user_guides/howto/training/multi_node.html) 可以在MPI集群上运行分布式训练任务 -- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html) +- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.3/api_cn/index_cn.html) 新的API支持代码更少更简洁的程序 -- [贡献方式](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html) +- [贡献方式](http://paddlepaddle.org/documentation/docs/zh/1.3/advanced_usage/development/contribute_to_paddle/index_cn.html) 欢迎您的贡献! diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index 10b633a4fc1063aab5c0d34b994f9c233e228f17..df159a334e86d62e175bce3b363b74ec78c1fd64 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -179,7 +179,6 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog, else: build_strategy.reduce_strategy = fluid.BuildStrategy( ).ReduceStrategy.AllReduce - build_strategy.fuse_broadcast_op = args.fuse_broadcast_op avg_loss = train_args[0] diff --git a/cmake/configure.cmake b/cmake/configure.cmake index b0f54bf49aafb65f1a92fa95877de2cc61fc67d3..93d74bb0a8f726ad31685cbfc7831b5441cd5108 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -20,31 +20,10 @@ if(WITH_DSO) add_definitions(-DPADDLE_USE_DSO) endif(WITH_DSO) -if(WITH_DOUBLE) - add_definitions(-DPADDLE_TYPE_DOUBLE) -endif(WITH_DOUBLE) - -if(WITH_ARM_FP16) - add_definitions(-DPADDLE_ARM_FP16) - add_definitions("-march=armv8.2-a+fp16+simd") -endif(WITH_ARM_FP16) - if(WITH_TESTING) add_definitions(-DPADDLE_WITH_TESTING) endif(WITH_TESTING) -if(NOT WITH_TIMER) - add_definitions(-DPADDLE_DISABLE_TIMER) -endif(NOT WITH_TIMER) - -if(USE_EIGEN_FOR_BLAS) - add_definitions(-DPADDLE_USE_EIGEN_FOR_BLAS) -endif(USE_EIGEN_FOR_BLAS) - -if(EIGEN_USE_THREADS) - add_definitions(-DEIGEN_USE_THREADS) -endif(EIGEN_USE_THREADS) - if(NOT WITH_PROFILER) add_definitions(-DPADDLE_DISABLE_PROFILER) endif(NOT WITH_PROFILER) @@ -78,10 +57,6 @@ if(WIN32) endif(NOT MSVC) endif(WIN32) -if(NOT WITH_GOLANG) - add_definitions(-DPADDLE_WITHOUT_GOLANG) -endif(NOT WITH_GOLANG) - if(WITH_PSLIB) add_definitions(-DPADDLE_WITH_PSLIB) endif() @@ -171,55 +146,6 @@ if(WITH_DISTRIBUTE) add_definitions(-DPADDLE_WITH_DISTRIBUTE) endif() -if(WITH_GOLANG) - # we need to symlink Paddle directory into GOPATH. If we - # don't do it and we have code that depends on Paddle, go - # get ./... will download a new Paddle repo from Github, - # without the changes in our current Paddle repo that we - # want to build. - set(GOPATH "${CMAKE_CURRENT_BINARY_DIR}/go") - file(MAKE_DIRECTORY ${GOPATH}) - set(PADDLE_IN_GOPATH "${GOPATH}/src/github.com/PaddlePaddle/Paddle") - file(MAKE_DIRECTORY "${PADDLE_IN_GOPATH}") - set(PADDLE_GO_PATH "${CMAKE_SOURCE_DIR}/go") - - add_custom_target(go_path) - add_custom_command(TARGET go_path - # Symlink Paddle directory into GOPATH - COMMAND mkdir -p ${PADDLE_IN_GOPATH} - COMMAND rm -rf ${PADDLE_IN_GOPATH} - COMMAND ln -sf ${CMAKE_SOURCE_DIR} ${PADDLE_IN_GOPATH} - # Automatically get all dependencies specified in the source code - # We can't run `go get -d ./...` for every target, because - # multiple `go get` can not run concurrently, but make need to be - # able to run with multiple jobs. - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - ) - - if (GLIDE_INSTALL) - if(EXISTS $ENV{GOPATH}/bin/glide) - set(GLIDE "$ENV{GOPATH}/bin/glide") - else() - message(FATAL_ERROR "no glide executeble found: $ENV{GOPATH}/bin/glide") - endif() - - # this command will only run when the file it depends is missing - # or has changed, or the output is missing. - add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/glide - COMMAND env GOPATH=${GOPATH} ${GLIDE} install - COMMAND touch ${CMAKE_BINARY_DIR}/glide - DEPENDS ${PADDLE_SOURCE_DIR}/go/glide.lock - WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go" - ) - - # depends on the custom command which outputs - # ${CMAKE_BINARY_DIR}/glide, the custom command does not need to - # run every time this target is built. - add_custom_target(go_vendor DEPENDS ${CMAKE_BINARY_DIR}/glide go_path) - endif() - -endif(WITH_GOLANG) - if(WITH_GRPC) add_definitions(-DPADDLE_WITH_GRPC) endif(WITH_GRPC) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index ef4192ecc98ea6de0c81c1f33320528d547b818a..735846db1db04e3884d72ec62d911d9a0efec147 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -168,10 +168,7 @@ elseif (${CUDA_VERSION} LESS 11.0) # CUDA 10.x endif() include_directories(${CUDA_INCLUDE_DIRS}) -list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY}) if(NOT WITH_DSO) - # TODO(panyx0718): CUPTI only allows DSO? - list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUPTI_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY}) if(WIN32) set_property(GLOBAL PROPERTY CUDA_MODULES ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY}) endif(WIN32) diff --git a/cmake/external/anakin.cmake b/cmake/external/anakin.cmake index 06fc6061bc98eec8c4c71860333f7d3456952aeb..77f4b34537577c7c5782675e7af19c73bc9f8e32 100644 --- a/cmake/external/anakin.cmake +++ b/cmake/external/anakin.cmake @@ -74,5 +74,3 @@ add_dependencies(anakin_shared extern_anakin) add_library(anakin_saber SHARED IMPORTED GLOBAL) set_property(TARGET anakin_saber PROPERTY IMPORTED_LOCATION ${ANAKIN_SABER_LIB}) add_dependencies(anakin_saber extern_anakin) - -list(APPEND external_project_dependencies anakin_shared anakin_saber) diff --git a/cmake/external/any.cmake b/cmake/external/any.cmake deleted file mode 100644 index 85cce80b70a1fcf57015ac7a264e4950616b2717..0000000000000000000000000000000000000000 --- a/cmake/external/any.cmake +++ /dev/null @@ -1,31 +0,0 @@ -INCLUDE(ExternalProject) - -SET(ANY_SOURCE_DIR ${THIRD_PARTY_PATH}/any) - -INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/extern_lib_any) - -ExternalProject_Add( - extern_lib_any - ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/PaddlePaddle/any.git" - GIT_TAG "15595d8324be9e8a9a80d9ae442fdd12bd66df5d" - PREFIX ${ANY_SOURCE_DIR} - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - TEST_COMMAND "" -) - -if (${CMAKE_VERSION} VERSION_LESS "3.3.0") - set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_any_dummy.c) - file(WRITE ${dummyfile} "const char * dummy_any = \"${dummyfile}\";") - add_library(lib_any STATIC ${dummyfile}) -else() - add_library(lib_any INTERFACE) -endif() - -add_dependencies(lib_any extern_lib_any) - -add_definitions(-DANY_IMPL_ANY_CAST_MOVEABLE) -LIST(APPEND external_project_dependencies lib_any) diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake index 12412a51a0fd1aaa9702bd4547fb935d94012ada..fc204dc9193bb28b654936048dd61a9b461abb2f 100644 --- a/cmake/external/boost.cmake +++ b/cmake/external/boost.cmake @@ -57,5 +57,4 @@ else() endif() add_dependencies(boost ${BOOST_PROJECT}) -list(APPEND external_project_dependencies boost) set(Boost_INCLUDE_DIR ${BOOST_INCLUDE_DIR}) diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake index 6b50cff7a66a33d9413627bfbc663cca06ba86f3..989d1dbd4cf593e779b94f7bb5eda613f000859c 100644 --- a/cmake/external/brpc.cmake +++ b/cmake/external/brpc.cmake @@ -69,5 +69,3 @@ SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES}) ADD_DEPENDENCIES(brpc extern_brpc) add_definitions(-DBRPC_WITH_GLOG) - -LIST(APPEND external_project_dependencies brpc) diff --git a/cmake/external/cub.cmake b/cmake/external/cub.cmake index f06728de91e4509be661e56baef641d591928b66..41ad8207743201fbddd1d678fc5122afe68207ae 100644 --- a/cmake/external/cub.cmake +++ b/cmake/external/cub.cmake @@ -31,5 +31,3 @@ else() endif() add_dependencies(cub extern_cub) - -LIST(APPEND external_project_dependencies cub) diff --git a/cmake/external/dlpack.cmake b/cmake/external/dlpack.cmake index 4587475d7902a134eecd54bf8241fb96d175d0ba..63dd16b28e40a0c2d5310bec011c721285049952 100644 --- a/cmake/external/dlpack.cmake +++ b/cmake/external/dlpack.cmake @@ -27,5 +27,3 @@ else() endif() add_dependencies(dlpack extern_dlpack) - -LIST(APPEND external_project_dependencies dlpack) diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 6aef97f21244efd09e22781f703553a19a9e1860..72441160f89d2c188d35fc6b08b5f0b6d746a1ad 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -52,5 +52,3 @@ else() endif() add_dependencies(eigen3 extern_eigen3) - -LIST(APPEND external_project_dependencies eigen3) diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index f3ca74faea3629ddce053c49ef1e629f230fdc49..911920ed6212b87aa25ba9a1faf7696fbcb22587 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -61,8 +61,6 @@ ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL) SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES}) ADD_DEPENDENCIES(gflags extern_gflags) -LIST(APPEND external_project_dependencies gflags) - # On Windows (including MinGW), the Shlwapi library is used by gflags if available. if (WIN32) include(CheckIncludeFileCXX) diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index d3a4d69d3a05515fdf72074083470e19b4ec255c..7fa17ce6b7b106c47c486729d0136748c73176a7 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -72,5 +72,3 @@ ADD_LIBRARY(glog STATIC IMPORTED GLOBAL) SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES}) ADD_DEPENDENCIES(glog extern_glog gflags) LINK_LIBRARIES(glog gflags) - -LIST(APPEND external_project_dependencies glog) diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index 9be625b620287cd4c644ae6908000fd5eec5d5c7..e459526583bd5ee3c89807657f3c30376e57d971 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -79,5 +79,4 @@ IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC)) SET_PROPERTY(TARGET gtest_main PROPERTY IMPORTED_LOCATION ${GTEST_MAIN_LIBRARIES}) ADD_DEPENDENCIES(gtest_main extern_gtest) - LIST(APPEND external_project_dependencies gtest gtest_main) ENDIF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC)) diff --git a/cmake/external/leveldb.cmake b/cmake/external/leveldb.cmake index 0df61b01ab64c8b751bdc3893dd5294ad39ab928..ac0febd076e659927a6a882ff487c61ac130437a 100644 --- a/cmake/external/leveldb.cmake +++ b/cmake/external/leveldb.cmake @@ -39,6 +39,3 @@ ADD_DEPENDENCIES(extern_leveldb snappy) ADD_LIBRARY(leveldb STATIC IMPORTED GLOBAL) SET_PROPERTY(TARGET leveldb PROPERTY IMPORTED_LOCATION ${LEVELDB_LIBRARIES}) ADD_DEPENDENCIES(leveldb extern_leveldb) - -LIST(APPEND external_project_dependencies leveldb) - diff --git a/cmake/external/libmct.cmake b/cmake/external/libmct.cmake index 27cff8cfb6315c9b4fa5677ad9062bee73a0e5d8..b944f2945b7874ca76bf1a19e0a363f564851a62 100644 --- a/cmake/external/libmct.cmake +++ b/cmake/external/libmct.cmake @@ -72,7 +72,4 @@ else() add_library(libmct INTERFACE) endif() -#ADD_LIBRARY(libmct SHARED IMPORTED GLOBAL) ADD_DEPENDENCIES(libmct ${LIBMCT_PROJECT}) -LIST(APPEND external_project_dependencies libmct) - diff --git a/cmake/external/libxsmm.cmake b/cmake/external/libxsmm.cmake index 39f49d210a20d49a06c120361ecf0a5d07d1af28..69cdba7c5921f14a87172d95791332e364045b26 100644 --- a/cmake/external/libxsmm.cmake +++ b/cmake/external/libxsmm.cmake @@ -53,5 +53,3 @@ MESSAGE(STATUS "Libxsmm library: ${LIBXSMM_LIBS}") include_directories(${LIBXSMM_INCLUDE_DIR}) ADD_DEFINITIONS(-DPADDLE_WITH_LIBXSMM) ADD_DEPENDENCIES(libxsmm extern_libxsmm) -LIST(APPEND external_project_dependencies libxsmm) - diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 92fe76d05c7507c295b784bc37870abfc31a0a29..b1e437a9007072c82ab375bf5ed79fc7d6c80c47 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -31,9 +31,17 @@ IF(APPLE) return() ENDIF() -MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path") +# Introduce variables: +# * CMAKE_INSTALL_LIBDIR +INCLUDE(GNUInstallDirs) +SET(LIBDIR "lib") +if(CMAKE_INSTALL_LIBDIR MATCHES ".*lib64$") + SET(LIBDIR "lib64") +endif() + +MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/l${LIBDIR} to runtime path") SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) -SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib") +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/${LIBDIR}") INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) # For MKLDNN code to include internal headers. @@ -58,7 +66,7 @@ ExternalProject_Add( ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS ${MKLDNN_DEPENDS} GIT_REPOSITORY "https://github.com/intel/mkl-dnn.git" - GIT_TAG "830a10059a018cd2634d94195140cf2d8790a75a" + GIT_TAG "863ff6e7042cec7d2e29897fe9f0872e0888b0fc" PREFIX ${MKLDNN_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} @@ -79,9 +87,9 @@ ExternalProject_Add( -DMKLROOT:PATH=${MKLML_ROOT} ) if(WIN32) - SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/mkldnn.lib" CACHE FILEPATH "mkldnn library." FORCE) + SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/${LIBDIR}/mkldnn.lib" CACHE FILEPATH "mkldnn library." FORCE) else(WIN32) - SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE) + SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/${LIBDIR}/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE) endif(WIN32) ADD_LIBRARY(shared_mkldnn SHARED IMPORTED GLOBAL) @@ -89,7 +97,6 @@ SET_PROPERTY(TARGET shared_mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB}) ADD_DEPENDENCIES(shared_mkldnn ${MKLDNN_PROJECT}) MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}") add_definitions(-DPADDLE_WITH_MKLDNN) -LIST(APPEND external_project_dependencies shared_mkldnn) # generate a static dummy target to track mkldnn dependencies # for cc_library(xxx SRCS xxx.c DEPS mkldnn) @@ -102,7 +109,7 @@ ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT}) # copy the real so.0 lib to install dir # it can be directly contained in wheel or capi if(WIN32) - SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/lib/mkldnn.dll) + SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/bin/mkldnn.dll) else(WIN32) SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0) ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB} diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index 2caff27357687018f29c1efc55b7b82c9dc3ccf6..ae2679db4aed7a77ad407f881c4482fd3914ac27 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -39,8 +39,10 @@ IF(WIN32) SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib) SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll) SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll) -ELSE() - SET(MKLML_VER "mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE) +ELSE() + #TODO(intel-huying): + # Now enable Erf function in mklml library temporarily, it will be updated as offical version later. + SET(MKLML_VER "Glibc225_vsErf_mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE) SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE) SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so) @@ -73,4 +75,3 @@ INCLUDE_DIRECTORIES(${MKLML_INC_DIR}) ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB}) ADD_DEPENDENCIES(mklml ${MKLML_PROJECT}) -LIST(APPEND external_project_dependencies mklml) diff --git a/cmake/external/ngraph.cmake b/cmake/external/ngraph.cmake index 14af98b2d74d4aa955aac27727e05567788a84c9..e7fb69dbbc872c813b2eba16a5b1098eebfeedd8 100644 --- a/cmake/external/ngraph.cmake +++ b/cmake/external/ngraph.cmake @@ -37,7 +37,7 @@ INCLUDE(GNUInstallDirs) INCLUDE(ExternalProject) SET(NGRAPH_PROJECT "extern_ngraph") -SET(NGRAPH_GIT_TAG "20bd8bbc79ae3a81c57313846a2be7313e5d1dab") +SET(NGRAPH_GIT_TAG "a444f7a959b7d87f2c117c9b57a4c387759e481e") SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph) SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph) SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include) @@ -69,7 +69,7 @@ ExternalProject_Add( CMAKE_ARGS -DNGRAPH_DEX_ONLY=TRUE CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} CMAKE_ARGS -DMKLDNN_INCLUDE_DIR=${MKLDNN_INC_DIR} - CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/lib + CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR} CMAKE_ARGS -DMKLML_LIB_DIR=${MKLML_INSTALL_DIR}/lib ) @@ -77,4 +77,3 @@ add_dependencies(ngraph ${NGRAPH_PROJECT}) target_compile_definitions(ngraph INTERFACE -DPADDLE_WITH_NGRAPH) target_include_directories(ngraph INTERFACE ${NGRAPH_INC_DIR}) target_link_libraries(ngraph INTERFACE ${NGRAPH_SHARED_LIB}) -LIST(APPEND external_project_dependencies ngraph) diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index b347a592929836a473ac764c0af1153b07d54258..d8a4a0be6f5aaa3a1a4977bbc68348743f2fa742 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -11,11 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -IF(USE_EIGEN_FOR_BLAS) - return() -ENDIF(USE_EIGEN_FOR_BLAS) - INCLUDE(cblas) IF(NOT ${CBLAS_FOUND}) @@ -91,7 +86,6 @@ ENDIF() IF(NOT ${CBLAS_FOUND}) ADD_DEPENDENCIES(cblas extern_openblas) - LIST(APPEND external_project_dependencies cblas) ELSE() IF("${CBLAS_PROVIDER}" STREQUAL "MKLML") ADD_DEPENDENCIES(cblas mklml) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index e05b7694ddf1e1652b00f156cde1a2d433c9fc46..bc7fe5454f5883108e43b4ca47920995dc13a1ff 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -129,7 +129,6 @@ macro(PROMPT_PROTOBUF_LIB) ADD_DEPENDENCIES(protoc ${dep}) ENDFOREACH() - LIST(APPEND external_project_dependencies protobuf) RETURN() endmacro() macro(SET_PROTOBUF_VERSION) @@ -231,7 +230,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) ) ENDFUNCTION() -SET(PROTOBUF_VERSION 3.1) +SET(PROTOBUF_VERSION 3.1.0) IF(NOT PROTOBUF_FOUND) build_protobuf(extern_protobuf FALSE) diff --git a/cmake/external/pslib.cmake b/cmake/external/pslib.cmake index b4ea268e5a48e29d00b0ec8b957b61a42553ec7e..0287e5cf2a835ed65c5fc26ff69d2653d5db217e 100644 --- a/cmake/external/pslib.cmake +++ b/cmake/external/pslib.cmake @@ -70,4 +70,3 @@ ExternalProject_Add( ADD_LIBRARY(pslib SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET pslib PROPERTY IMPORTED_LOCATION ${PSLIB_LIB}) ADD_DEPENDENCIES(pslib ${PSLIB_PROJECT}) -LIST(APPEND external_project_dependencies pslib) diff --git a/cmake/external/pslib_brpc.cmake b/cmake/external/pslib_brpc.cmake index 8b43f2ef5c999fc351543ba958c7cc4b0856625d..22c8c1b463764b6e107c5f3da25d51b36c2ce59f 100644 --- a/cmake/external/pslib_brpc.cmake +++ b/cmake/external/pslib_brpc.cmake @@ -70,4 +70,3 @@ ExternalProject_Add( ADD_LIBRARY(pslib_brpc SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET pslib_brpc PROPERTY IMPORTED_LOCATION ${PSLIB_BRPC_LIB}) ADD_DEPENDENCIES(pslib_brpc ${PSLIB_BRPC_PROJECT}) -LIST(APPEND external_project_dependencies pslib_brpc) diff --git a/cmake/external/threadpool.cmake b/cmake/external/threadpool.cmake index 0159815fed81bdff6de3e561af569e9edc75f947..1f56bc7ab056ef0dd95d603ebe3461ef044b2a79 100644 --- a/cmake/external/threadpool.cmake +++ b/cmake/external/threadpool.cmake @@ -26,5 +26,3 @@ else() endif() add_dependencies(simple_threadpool extern_threadpool) - -LIST(APPEND external_project_dependencies simple_threadpool) diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index 7a25aaf15f2c7f46d99394d82d69bc24e4f5cb2c..6f2af8670f25c00ac0970fe4ae2b0c5b03aa0d9e 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -83,5 +83,3 @@ INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include wa ADD_LIBRARY(warpctc SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES}) ADD_DEPENDENCIES(warpctc extern_warpctc) - -LIST(APPEND external_project_dependencies warpctc) diff --git a/cmake/external/xbyak.cmake b/cmake/external/xbyak.cmake index 384c2f9328296ce6a8a6293be6cc47e5063dd3c4..1d61154c0d45dea795902d6544deb796693db263 100644 --- a/cmake/external/xbyak.cmake +++ b/cmake/external/xbyak.cmake @@ -55,4 +55,3 @@ else() endif() add_dependencies(xbyak ${XBYAK_PROJECT}) -list(APPEND external_project_dependencies xbyak) diff --git a/cmake/external/xxhash.cmake b/cmake/external/xxhash.cmake index a0f300c2e8bab9e7402f869eed1b4c2d1c579aab..23b1e02108642df561948a6faa3152effb7ca932 100644 --- a/cmake/external/xxhash.cmake +++ b/cmake/external/xxhash.cmake @@ -71,5 +71,3 @@ add_library(xxhash STATIC IMPORTED GLOBAL) set_property(TARGET xxhash PROPERTY IMPORTED_LOCATION ${XXHASH_LIBRARIES}) include_directories(${XXHASH_INCLUDE_DIR}) add_dependencies(xxhash extern_xxhash) - -LIST(APPEND external_project_dependencies xxhash) diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index 6c8d79c25e6a2655711fe4450e65600c9a584015..5569fefe992d10ad4820e51e677f40271d0214e7 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -57,5 +57,3 @@ ENDIF(WIN32) ADD_LIBRARY(zlib STATIC IMPORTED GLOBAL) SET_PROPERTY(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES}) ADD_DEPENDENCIES(zlib extern_zlib) - -LIST(APPEND external_project_dependencies zlib) diff --git a/cmake/hip.cmake b/cmake/hip.cmake index 4276bc5b08cd88a52bb5782bca87fc37deabd830..c3a748db502037f926dc241e4c3bc26a83ad3468 100644 --- a/cmake/hip.cmake +++ b/cmake/hip.cmake @@ -11,8 +11,6 @@ include_directories("/opt/rocm/rocrand/include") include_directories("/opt/rocm/rccl/include") include_directories("/opt/rocm/thrust") -list(APPEND EXTERNAL_LIBS "-L/opt/rocm/lib/ -lhip_hcc") - set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++11" ) if(WITH_DSO) @@ -31,22 +29,12 @@ if(WITH_GRPC) set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_GRPC") endif(WITH_GRPC) -if(NOT WITH_GOLANG) - set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITHOUT_GOLANG") -endif(NOT WITH_GOLANG) - if(WITH_MKLDNN) set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_MKLDNN") endif(WITH_MKLDNN) set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DANY_IMPL_ANY_CAST_MOVEABLE") -if(NOT WITH_RDMA) - set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_DISABLE_RDMA") -endif(NOT WITH_RDMA) - - - if(CMAKE_BUILD_TYPE STREQUAL "Debug") list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") diff --git a/cmake/operators.cmake b/cmake/operators.cmake index c2d04828564e69d7ac965881057f185194aa0475..11a5b1b4554e7899c3ee7092a9295234743750d7 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -153,7 +153,11 @@ function(op_library TARGET) # pybind USE_OP_DEVICE_KERNEL for CUDNN list(LENGTH cudnn_cu_cc_srcs cudnn_cu_cc_srcs_len) if (WITH_GPU AND ${cudnn_cu_cc_srcs_len} GREATER 0) + if(${TARGET} STREQUAL "activation") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, CUDNN);\n") + else() file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n") + endif() endif() # pybind USE_OP_DEVICE_KERNEL for MIOPEN @@ -168,6 +172,9 @@ function(op_library TARGET) file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n") elseif(${MKLDNN_FILE} STREQUAL "conv_mkldnn_op") file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32);\n") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, S8);\n") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, U8);\n") + else() file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n") endif() diff --git a/cmake/rdma.cmake b/cmake/rdma.cmake deleted file mode 100644 index b698f3bdc3ff586a72badee3e0109e29285b457f..0000000000000000000000000000000000000000 --- a/cmake/rdma.cmake +++ /dev/null @@ -1,82 +0,0 @@ -# user should download rdma first from subversion repository - -# execute following instruction to download svn mannally -# svn co https://svn.baidu.com/sys/ip/trunk/rdma/sockrdmav1 rdma/ -# svn co https://svn.baidu.com/sys/ip/trunk/rdma/thirdparty rdma/ -# we use static output in svn repositories to avoid implict bugs from not standard runtime env. - -if(WITH_RDMA) - set(RDMA_ROOT $ENV{RDMA_ROOT} CACHE PATH "Folder contains RDMA sock library and thirdparty library") - - function(generate_rdma_links) - #redirect to current DIR to isolate the pollution from system runtime environment - #it can benifits unified control for different gcc environment. - #e.g, by default gcc48 did not refer /usr/lib64 which could contain low version - #runtime libraries that will crash process while loading it. That redirect trick - #can fix it. - execute_process( - COMMAND mkdir -p librdma - COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so.1 - COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so - COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so.1 - COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so - COMMAND ln -s -f /lib64/libnl.so.1.1.4 librdma/libnl.so.1 - COMMAND ln -s -f /lib64/libnl.so.1.1.4 librdma/libnl.so - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} - ) - endfunction(generate_rdma_links) - - #check and set headers - find_path(RDMA_INC_SXISOCK sxi_sock.h PATHS ${RDMA_ROOT}/sockrdmav1/output/include) - find_path(RDMA_INC_XIO libxio.h PATHS ${RDMA_ROOT}/thirdparty/output/accelio) - find_path(RDMA_INC_EVENT event2 PATHS ${RDMA_ROOT}/thirdparty/output/libevent) - find_path(RDMA_INC_NUMA numa.h PATHS ${RDMA_ROOT}/thirdparty/output/libnuma) - - #check and set libs - find_library(RDMA_LIB_SXISOCK NAMES sxisock PATHS ${RDMA_ROOT}/sockrdmav1/output) - find_library(RDMA_LIB_XIO NAMES xio PATHS ${RDMA_ROOT}/thirdparty/output/accelio) - find_library(RDMA_LIB_EVENT NAMES event PATHS ${RDMA_ROOT}/thirdparty/output/libevent) - find_library(RDMA_LIB_EVENT_CORE NAMES event_core PATHS ${RDMA_ROOT}/thirdparty/output/libevent) - find_library(RDMA_LIB_EVENT_EXTRA NAMES event_extra PATHS ${RDMA_ROOT}/thirdparty/output/libevent) - find_library(RDMA_LIB_EVENT_PTHREADS NAMES event_pthreads PATHS ${RDMA_ROOT}/thirdparty/output/libevent) - find_library(RDMA_LIB_NUMA NAMES numa PATHS ${RDMA_ROOT}/thirdparty/output/libnuma) - - if( - RDMA_INC_SXISOCK AND - RDMA_INC_XIO AND - RDMA_INC_EVENT AND - RDMA_INC_NUMA AND - RDMA_LIB_SXISOCK AND - RDMA_LIB_XIO AND - RDMA_LIB_EVENT AND - RDMA_LIB_EVENT_CORE AND - RDMA_LIB_EVENT_EXTRA AND - RDMA_LIB_EVENT_PTHREADS AND - RDMA_LIB_NUMA - ) - - set(RDMA_INC_DIR - ${RDMA_INC_SXISOCK} - ${RDMA_INC_XIO} - ${RDMA_INC_EVENT} - ${RDMA_INC_NUMA}) - set(RDMA_LIBS - ${RDMA_LIB_SXISOCK} - ${RDMA_LIB_XIO} - ${RDMA_LIB_EVENT} - ${RDMA_LIB_EVENT_CORE} - ${RDMA_LIB_EVENT_EXTRA} - ${RDMA_LIB_EVENT_PTHREADS} - ${RDMA_LIB_NUMA} - ) - set(RDMA_LD_FLAGS "-L./librdma -libverbs -lrdmacm -Xlinker -rpath ./librdma") - include_directories("${RDMA_INC_DIR}") - else() - #if this module is not called, RDMA_INC_DIR RDMA_LIBS will be null, so top module always refer this variable - message(FATAL_ERROR, "RDMA libraries are not found, try to set RDMA_ROOT or check all related libraries.") - endif() -else(WITH_RDMA) - set(RDMA_LIBS "") - set(RDMA_LD_FLAGS "") - add_definitions(-DPADDLE_DISABLE_RDMA) -endif(WITH_RDMA) diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake index 3dc7171551bfb7aff8d1e75083c98b00378d247f..891ff222633741f9894c2fdb6c0096a48f8a35e1 100644 --- a/cmake/tensorrt.cmake +++ b/cmake/tensorrt.cmake @@ -33,6 +33,5 @@ if(TENSORRT_FOUND) message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. " "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ") include_directories(${TENSORRT_INCLUDE_DIR}) - list(APPEND EXTERNAL_LIBS ${TENSORRT_LIBRARY}) add_definitions(-DPADDLE_WITH_TENSORRT) endif() diff --git a/paddle/contrib/float16/README.md b/paddle/contrib/float16/README.md index 58b4a50666bfb622af8acbce29355f2a4a870a82..a1f8cb42451dd5e84c97d6830216d284cc8bd819 100644 --- a/paddle/contrib/float16/README.md +++ b/paddle/contrib/float16/README.md @@ -5,13 +5,13 @@ Kexin Zhao ## Introduction Deep learning is usually a two-stage work: training and inference. The training stage estimates model parameters (weights) from data. The inference stage loads the weights and uses them to interpret inputs. Typically, weights are 32-bit float values (float32). Some new devices, including NVIDIA Volta GPUs, support higher speed computation using 16-bit float values (float16). -This article explains our efforts with PaddlePaddle to train using float32 and to inference using float16. We describe a [*transpiler*](https://github.com/PaddlePaddle/Paddle/blob/a4d3de0071e1f3912230c3ab3f9ac74cf06b093a/doc/fluid/design/motivation/fluid_compiler.md), which converts a PaddlePaddle Fluid model, which, to be precise, should be called a [Fluid *program*](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md), into the inference program, and converts the weights from float32 into float16. +This article explains our efforts with PaddlePaddle to train using float32 and to inference using float16. We describe a [*transpiler*](https://github.com/PaddlePaddle/Paddle/blob/a4d3de0071e1f3912230c3ab3f9ac74cf06b093a/doc/fluid/design/motivation/fluid_compiler.md), which converts a PaddlePaddle Fluid model, which, to be precise, should be called a [Fluid *program*](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/concepts/program.md), into the inference program, and converts the weights from float32 into float16. ## What is float16? float16 (or FP16) is a half-precision floating-point format that uses 16 bits in memory to represent a value. The advantage over 32-bit single-precision floating-point format (commonly known as float or float32 data type) is that it requires half the storage and bandwidth at the expense of precision and range. Fortunately, DNN inference has a high tolerance for the loss of precision and range when using float16 to represent the weights, and the inference accuracy will only be minimally affected in most cases, which gives us the opportunity to use float16 data type to speed up the inference. -Interested readers can refer to our [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/data_type/float16.md) and [code](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/float16.h) for more details on how we implement the float16 data type. +Interested readers can refer to our [design doc](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/data_type/float16.md) and [code](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/float16.h) for more details on how we implement the float16 data type. ## Why float16? The trend in today's deep learning community is to use bigger and deeper model, which translates to larger memory footprint, higher computation demands, and as a result higher energy consumption on computing devices. The advantages of float16 over float32 are correspondingly three-fold: @@ -24,12 +24,12 @@ The trend in today's deep learning community is to use bigger and deeper model, ## Fluid implementation of float16 inference ### Overview -Fluid use [Program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#program) instead of computation graph to describe a neural network model and the optimization procedure. Fluid program is a python wrapper around a protobuf message called [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md). Similar to programming languages, the basic structure of a Fluid program is some nested [blocks](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#block), where each block consists of some [variable](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#variable) definitions and a sequence of [operators](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#operator). An [executor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/executor.md) will run a given program by sequentially executing the operators in the entrance block. +Fluid use [Program](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/modules/python_api.md#program) instead of computation graph to describe a neural network model and the optimization procedure. Fluid program is a python wrapper around a protobuf message called [ProgramDesc](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/concepts/program.md). Similar to programming languages, the basic structure of a Fluid program is some nested [blocks](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/modules/python_api.md#block), where each block consists of some [variable](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/modules/python_api.md#variable) definitions and a sequence of [operators](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/modules/python_api.md#operator). An [executor](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/concepts/executor.md) will run a given program by sequentially executing the operators in the entrance block. ### Basic requirement When an executor runs an operator, it uses a kernel to perform computations on tensors contained in the input variables, and then writes the results to the tensors in the output variables. Each operator has multiple kernels for different combinations of data types, devices, and library types, respectively. The operator will select the appropriate kernel to run based on, among other things, the data type of the input tensors. By default, every Fluid operator has a kernel for float data type that takes float inputs and generates float outputs. -If we provide float input to the first operator in a program, then each operator will use float kernel to compute float output and send it as input to the next operator to trigger its float kernel. This chain effect will make the program run in float mode and gives us a final output of float data type. +If we provide float input to the first operator in a program, then each operator will use float kernel to compute float output and send it as input to the next operator to trigger its float kernel. This chain effect will make the program run in float mode and gives us a final output of float data type. The same principle applies if we want a program to run in float16 mode. We provide input variable of the float16 data type to the first operator, and every subsequent operator will invoke the float16 kernel until we get the final output in float16. So the preliminary requirements for float16 inference are to add float16 kernels to operators that are needed in a specific kind of neural networks. Our current focus is on Convolutional Neural Networks (CNN) and hence we have added float16 kernels to the following operators: convolution, pooling, GEMM, elementwise addition, batch norm, dropout, various activations including relu and tanh, and softmax. @@ -75,7 +75,7 @@ In this scenario, we already have a float32 inference program and some associate We can then run various inference experiments in float16 mode and save the float16 program and weights on disk for future deployment. To enhance the code usability, we maintain a consistent API so that user can use the same float32 input data to run inference program in either float32 and float16 mode and obtain output data both of float32 data type. Consequently, we need to add cast operators in the float16 inference program for conversions between the float16 tensor and float32 tensor. -The float16 transpiler is implemented to fulfill the requirements mentioned above. The details of the float16 transpiler can be found [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/data_type/float16.md#float16-inference). +The float16 transpiler is implemented to fulfill the requirements mentioned above. The details of the float16 transpiler can be found [here](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/data_type/float16.md#float16-inference). ### Experiment results Simply running the following commands to reproduce the experiment results presented in this section: @@ -113,7 +113,7 @@ We repeat the test ten times and get the following results: | #10 | 62.53% | 62.48% | | average| 62.63% | 62.62% | -We can see that the accuracy of float16 inference is very close to that of float32 inference in every experiment (within 0.05% difference) and is overall 0.01% better than its float32 counterpart averaged over ten tests. +We can see that the accuracy of float16 inference is very close to that of float32 inference in every experiment (within 0.05% difference) and is overall 0.01% better than its float32 counterpart averaged over ten tests. #### Performance benchmark Currently, Fluid only supports float16 inference on NVIDIA GPUs. There is no motivation to support float16 inference on non-ARM CPUs where float16 is not natively supported, and float16 calculation will only be slower than its float32 counterpart. @@ -132,7 +132,7 @@ Average inference time for one mini-batch on Vgg16 model tested on ImageNet data |float16| 3.32 | 4.11 | 5.88 | 9.41 | 16.54 | 30.47 | 60.23 | |Speedup| 4.22 | 2.36  | 3.91 | 3.00 | 3.26  | 2.77 | 2.97 | -We can see that float16 inference provides **2x ~ 4x** speedup on different batch sizes. +We can see that float16 inference provides **2x ~ 4x** speedup on different batch sizes. Convolution operation is ususally the computational bottleneck of CNN, so we also check the average time spent on the Fluid convolution operators for one mini-batch as follows: @@ -162,7 +162,7 @@ We find that the speedup provided by float16 inference starts relatively small a We also did the same benchmark on a single NVIDIA GeForce GTX 1080 Ti GPU that does not support Tensor Core. The results show that for Vgg16, float16 inference provides consistent small speedup (around 1.15x) for all mini-batch sizes, while for Resnet50, float16 inference is slower than its float32 counterpart in small batch sizes (mb = 1 and 2) and then delivers around 1.15x speedup for all larger batch sizes. By comparing the benchmarks on 1080 Ti and V100, we find that Tensor Core, which is specialized for float16 computations, is a critical component of high performance float16 inference. -Please refer to [here](https://github.com/PaddlePaddle/Paddle/blob/develop/contrib/float16/float16_benchmark.md) for complete benchmark results. +Please refer to [here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/contrib/float16/float16_benchmark.md) for complete benchmark results. ### Summary 1. Fluid is now able to run inference in float16 mode via a float16 transpiler. We currently support CNN programs, including Vgg and Resnet, to run in float16 inference mode. diff --git a/paddle/contrib/float16/run_float16_demo.sh b/paddle/contrib/float16/run_float16_demo.sh index 031225a85dabb26e5d9ea06f58909c049e7f0c08..34cb7a12db171915f2bc7df8787dd62cd381de68 100755 --- a/paddle/contrib/float16/run_float16_demo.sh +++ b/paddle/contrib/float16/run_float16_demo.sh @@ -14,9 +14,7 @@ cmake .. -DWITH_AVX=OFF \ -DWITH_MKL=OFF \ -DWITH_GPU=ON \ -DWITH_TESTING=ON \ - -DWITH_TIMER=ON \ -DWITH_PROFILER=ON \ - -DWITH_FLUID_ONLY=ON make -j `nproc` pip install -U "$WHEEL_PATH/$(ls $WHEEL_PATH)" diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index df961be911537582fb60dd258fcdd4a5dd41a38e..68c6c8fd67db9f4428d612e86305fa0ba5f98a50 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -1,518 +1,527 @@ -paddle.fluid.Program.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Program.block ArgSpec(args=['self', 'index'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Program.clone ArgSpec(args=['self', 'for_test'], varargs=None, keywords=None, defaults=(False,)) -paddle.fluid.Program.current_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Program.global_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Program.list_vars ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Program.parse_from_string ArgSpec(args=['binary_str'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Program.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,)) -paddle.fluid.default_startup_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None) -paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None) -paddle.fluid.program_guard ArgSpec(args=['main_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.name_scope ArgSpec(args=['prefix'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Executor.close ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)) -paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None) -paddle.fluid.scope_guard ArgSpec(args=['scope'], varargs=None, keywords=None, defaults=None) -paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) -paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) -paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) -paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)) -paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174')) -paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, False)) -paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.Program.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.Program.block (ArgSpec(args=['self', 'index'], varargs=None, keywords=None, defaults=None), ('document', 'af5346376065ff4cf6832a8ac0ae0945')) +paddle.fluid.Program.clone (ArgSpec(args=['self', 'for_test'], varargs=None, keywords=None, defaults=(False,)), ('document', 'ebb7765b2962bd2be041d19720e49d0f')) +paddle.fluid.Program.current_block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '5e162d3bf8dd625703463d9e4be36adb')) +paddle.fluid.Program.global_block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'cfb7e05a002b2e64650778cabde7301c')) +paddle.fluid.Program.list_vars (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '1c8647b14fe57c7824b1c9562394dd3c')) +paddle.fluid.Program.parse_from_string (ArgSpec(args=['binary_str'], varargs=None, keywords=None, defaults=None), ('document', 'b6a7ffb239a30bf2ce58cfaca8d8b8d5')) +paddle.fluid.Program.to_string (ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,)), ('document', 'faec17e5a04af28e3776160e34504d15')) +paddle.fluid.default_startup_program (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '99e5d53d92d82797093332719c9e3ccd')) +paddle.fluid.default_main_program (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '5430f54ab4895f9f47db6bebbaf71659')) +paddle.fluid.program_guard (ArgSpec(args=['main_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b54f403e57825a1592aece03afe3afb6')) +paddle.fluid.name_scope (ArgSpec(args=['prefix'], varargs=None, keywords=None, defaults=(None,)), ('document', '0ef753f5cec69fef9ae6ad8b867b33a2')) +paddle.fluid.Executor.__init__ (ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.Executor.close (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'f5369953dd0c443961cf79f7a00e1a03')) +paddle.fluid.Executor.run (ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)), ('document', 'aba8093edebf2d5c869b735b92811e45')) +paddle.fluid.global_scope (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'e148d3ab1ed8edf3e928212a375959c0')) +paddle.fluid.scope_guard (ArgSpec(args=['scope'], varargs=None, keywords=None, defaults=None), ('document', 'b94d1f6bcc29c4fb58fc0058561250c2')) +paddle.fluid.DistributeTranspiler.__init__ (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.DistributeTranspiler.get_pserver_program (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', '292ab72977afbe58e6a3bde175452680')) +paddle.fluid.DistributeTranspiler.get_pserver_programs (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', '78f4949aedf317666a89ca74b3748ba8')) +paddle.fluid.DistributeTranspiler.get_startup_program (ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'd796fc0c8d51503b556fcf6dc15c4f0c')) +paddle.fluid.DistributeTranspiler.get_trainer_program (ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)), ('document', '736330e31a7a54abccc0c7fd9119d9ff')) +paddle.fluid.DistributeTranspiler.transpile (ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174')), ('document', '06ce55338dfe96311ad1078235ab3bf4')) +paddle.fluid.memory_optimize (ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, False)), ('document', 'eda17d0f1639bc6ca215cecf87f588a4')) +paddle.fluid.release_memory (ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ac4114d3df16264f1946deb3a8434a6f')) paddle.fluid.DistributeTranspilerConfig.__init__ -paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None)) -paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)) -paddle.fluid.create_lod_tensor ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None) -paddle.fluid.create_random_int_lodtensor ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None) -paddle.fluid.DataFeedDesc.__init__ ArgSpec(args=['self', 'proto_file'], varargs=None, keywords=None, defaults=None) -paddle.fluid.DataFeedDesc.desc ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.DataFeedDesc.set_batch_size ArgSpec(args=['self', 'batch_size'], varargs=None, keywords=None, defaults=None) -paddle.fluid.DataFeedDesc.set_dense_slots ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.DataFeedDesc.set_use_slots ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.AsyncExecutor.__init__ ArgSpec(args=['self', 'place', 'run_mode'], varargs=None, keywords=None, defaults=(None, '')) -paddle.fluid.AsyncExecutor.config_distributed_nodes ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.AsyncExecutor.download_data ArgSpec(args=['self', 'afs_path', 'local_path', 'fs_default_name', 'ugi', 'file_cnt', 'hadoop_home', 'process_num'], varargs=None, keywords=None, defaults=('$HADOOP_HOME', 12)) -paddle.fluid.AsyncExecutor.get_instance ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.AsyncExecutor.init_model ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.AsyncExecutor.init_server ArgSpec(args=['self', 'dist_desc'], varargs=None, keywords=None, defaults=None) -paddle.fluid.AsyncExecutor.init_worker ArgSpec(args=['self', 'dist_desc', 'startup_program'], varargs=None, keywords=None, defaults=None) -paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'mode', 'debug'], varargs=None, keywords=None, defaults=('', False)) -paddle.fluid.AsyncExecutor.save_model ArgSpec(args=['self', 'save_path'], varargs=None, keywords=None, defaults=None) -paddle.fluid.AsyncExecutor.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.CompiledProgram.__init__ ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=None) -paddle.fluid.CompiledProgram.with_data_parallel ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from'], varargs=None, keywords=None, defaults=(None, None, None, None)) -paddle.fluid.CompiledProgram.with_inference_optimize ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=None) +paddle.fluid.ParallelExecutor.__init__ (ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.ParallelExecutor.run (ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '2cb4bd74481861345c70228a0f57620c')) +paddle.fluid.create_lod_tensor (ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None), ('document', '8e7bb21e83ff4604f5b379672e285b94')) +paddle.fluid.create_random_int_lodtensor (ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None), ('document', '368f638b99f1dfe59e9b02aa6f077752')) +paddle.fluid.DataFeedDesc.__init__ (ArgSpec(args=['self', 'proto_file'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.DataFeedDesc.desc (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '4294493e31c4bc9fc4bd48753044235f')) +paddle.fluid.DataFeedDesc.set_batch_size (ArgSpec(args=['self', 'batch_size'], varargs=None, keywords=None, defaults=None), ('document', '8d9f44601e0a99dd431f14fd9250cd21')) +paddle.fluid.DataFeedDesc.set_dense_slots (ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None), ('document', 'eb894b464bbcd1b4bc8038398954f766')) +paddle.fluid.DataFeedDesc.set_use_slots (ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None), ('document', '415c56600ce4e198c071cad01409a690')) +paddle.fluid.AsyncExecutor.__init__ (ArgSpec(args=['self', 'place', 'run_mode'], varargs=None, keywords=None, defaults=(None, '')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.AsyncExecutor.config_distributed_nodes (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '4810dbe1870452f16b3c60b6c5fd1459')) +paddle.fluid.AsyncExecutor.download_data (ArgSpec(args=['self', 'afs_path', 'local_path', 'fs_default_name', 'ugi', 'file_cnt', 'hadoop_home', 'process_num'], varargs=None, keywords=None, defaults=('$HADOOP_HOME', 12)), ('document', '799a2066cc26819f1ed31f47c15ad083')) +paddle.fluid.AsyncExecutor.get_instance (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'f8688f76a2db1243c7097a60c507b182')) +paddle.fluid.AsyncExecutor.init_model (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '504f39be2007404a17e5cabea1256c7d')) +paddle.fluid.AsyncExecutor.init_server (ArgSpec(args=['self', 'dist_desc'], varargs=None, keywords=None, defaults=None), ('document', 'c403ab46c5d3ef25c0f7e94ae75dcb68')) +paddle.fluid.AsyncExecutor.init_worker (ArgSpec(args=['self', 'dist_desc', 'startup_program'], varargs=None, keywords=None, defaults=None), ('document', 'dcf08f4bf2f3282acf11391f5d39c536')) +paddle.fluid.AsyncExecutor.run (ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'mode', 'debug'], varargs=None, keywords=None, defaults=('', False)), ('document', '848fc53484e8326f6325feea87fe955c')) +paddle.fluid.AsyncExecutor.save_model (ArgSpec(args=['self', 'save_path'], varargs=None, keywords=None, defaults=None), ('document', 'c8ac0dfcb3b187aba25d03af7fea56b2')) +paddle.fluid.AsyncExecutor.stop (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '5f23d043607bb5d55e466ec3f578e093')) +paddle.fluid.CompiledProgram.__init__ (ArgSpec(args=['self', 'program_or_graph'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.CompiledProgram.with_data_parallel (ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'e1af7fd53cf868554f312779fc803864')) +paddle.fluid.CompiledProgram.with_inference_optimize (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=None), ('document', '9e5b009d850191a010e859189c127fd8')) paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.ReduceStrategy, arg0: int) -> None paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy) -> None -paddle.fluid.io.save_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)) -paddle.fluid.io.save_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) -paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) -paddle.fluid.io.load_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)) -paddle.fluid.io.load_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) -paddle.fluid.io.load_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) -paddle.fluid.io.save_inference_model ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True)) -paddle.fluid.io.load_inference_model ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename', 'pserver_endpoints'], varargs=None, keywords=None, defaults=(None, None, None)) -paddle.fluid.initializer.ConstantInitializer.__init__ ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False)) -paddle.fluid.initializer.UniformInitializer.__init__ ArgSpec(args=['self', 'low', 'high', 'seed'], varargs=None, keywords=None, defaults=(-1.0, 1.0, 0)) -paddle.fluid.initializer.NormalInitializer.__init__ ArgSpec(args=['self', 'loc', 'scale', 'seed'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0)) -paddle.fluid.initializer.TruncatedNormalInitializer.__init__ ArgSpec(args=['self', 'loc', 'scale', 'seed'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0)) -paddle.fluid.initializer.XavierInitializer.__init__ ArgSpec(args=['self', 'uniform', 'fan_in', 'fan_out', 'seed'], varargs=None, keywords=None, defaults=(True, None, None, 0)) -paddle.fluid.initializer.BilinearInitializer.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.initializer.MSRAInitializer.__init__ ArgSpec(args=['self', 'uniform', 'fan_in', 'seed'], varargs=None, keywords=None, defaults=(True, None, 0)) -paddle.fluid.initializer.force_init_on_cpu ArgSpec(args=[], varargs=None, keywords=None, defaults=None) -paddle.fluid.initializer.init_on_cpu ArgSpec(args=[], varargs=None, keywords=None, defaults=None) -paddle.fluid.initializer.NumpyArrayInitializer.__init__ ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None)) -paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')) -paddle.fluid.layers.dynamic_lstm ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None)) -paddle.fluid.layers.dynamic_lstmp ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None)) -paddle.fluid.layers.dynamic_gru ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None, False)) -paddle.fluid.layers.gru_unit ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid', False)) -paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.crf_decoding ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100)) -paddle.fluid.layers.bpr_loss ArgSpec(args=['input', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None)) -paddle.fluid.layers.conv2d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)) -paddle.fluid.layers.conv3d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)) -paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type', 'is_test'], varargs=None, keywords=None, defaults=(False,)) -paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None)) -paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None)) -paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) -paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) -paddle.fluid.layers.adaptive_pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)) -paddle.fluid.layers.adaptive_pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)) -paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)) -paddle.fluid.layers.data_norm ArgSpec(args=['input', 'act', 'epsilon', 'param_attr', 'data_layout', 'in_place', 'use_mkldnn', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var'], varargs=None, keywords=None, defaults=(None, 1e-05, None, 'NCHW', False, False, None, None, None, False)) -paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) -paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) -paddle.fluid.layers.sequence_expand ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None)) -paddle.fluid.layers.sequence_expand_as ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.sequence_pad ArgSpec(args=['x', 'pad_value', 'maxlen', 'name'], varargs=None, keywords=None, defaults=(None, None)) -paddle.fluid.layers.sequence_unpad ArgSpec(args=['x', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.lstm_unit ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None)) -paddle.fluid.layers.reduce_sum ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)) -paddle.fluid.layers.reduce_mean ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)) -paddle.fluid.layers.reduce_max ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)) -paddle.fluid.layers.reduce_min ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)) -paddle.fluid.layers.reduce_prod ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)) -paddle.fluid.layers.sequence_first_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.sequence_last_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.sequence_slice ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.dropout ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name', 'dropout_implementation'], varargs=None, keywords=None, defaults=(False, None, None, 'downgrade_in_infer')) -paddle.fluid.layers.split ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None)) -paddle.fluid.layers.ctc_greedy_decoder ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.edit_distance ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens'], varargs=None, keywords=None, defaults=(True, None)) -paddle.fluid.layers.l2_normalize ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None)) -paddle.fluid.layers.matmul ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None)) -paddle.fluid.layers.topk ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_times', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, False, False)) -paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None)) -paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False)) -paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False)) -paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False)) -paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)) -paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)) -paddle.fluid.layers.group_norm ArgSpec(args=['input', 'groups', 'epsilon', 'param_attr', 'bias_attr', 'act', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(1e-05, None, None, None, 'NCHW', None)) -paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode', 'return_softmax'], varargs=None, keywords=None, defaults=(False, -100, False, False)) -paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None)) -paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.autoincreased_step_counter ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1)) -paddle.fluid.layers.reshape ArgSpec(args=['x', 'shape', 'actual_shape', 'act', 'inplace', 'name'], varargs=None, keywords=None, defaults=(None, None, False, None)) -paddle.fluid.layers.squeeze ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.unsqueeze ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.lod_reset ArgSpec(args=['x', 'y', 'target_lod'], varargs=None, keywords=None, defaults=(None, None)) -paddle.fluid.layers.lrn ArgSpec(args=['input', 'n', 'k', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(5, 1.0, 0.0001, 0.75, None)) -paddle.fluid.layers.pad ArgSpec(args=['x', 'paddings', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None)) -paddle.fluid.layers.pad_constant_like ArgSpec(args=['x', 'y', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None)) -paddle.fluid.layers.label_smooth ArgSpec(args=['label', 'prior_dist', 'epsilon', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 0.1, 'float32', None)) -paddle.fluid.layers.roi_pool ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0)) -paddle.fluid.layers.roi_align ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None)) -paddle.fluid.layers.dice_loss ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,)) -paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None, True, 1)) -paddle.fluid.layers.image_resize_short ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',)) -paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, None, True, 1)) -paddle.fluid.layers.resize_nearest ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners'], varargs=None, keywords=None, defaults=(None, None, None, None, True)) -paddle.fluid.layers.gather ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.sequence_scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.random_crop ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.mean_iou ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.selu ArgSpec(args=['x', 'scale', 'alpha', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) -paddle.fluid.layers.log ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) -paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.margin_rank_loss ArgSpec(args=['label', 'left', 'right', 'margin', 'name'], varargs=None, keywords=None, defaults=(0.1, None)) -paddle.fluid.layers.elu ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(1.0, None)) -paddle.fluid.layers.relu6 ArgSpec(args=['x', 'threshold', 'name'], varargs=None, keywords=None, defaults=(6.0, None)) -paddle.fluid.layers.pow ArgSpec(args=['x', 'factor', 'name'], varargs=None, keywords=None, defaults=(1.0, None)) -paddle.fluid.layers.stanh ArgSpec(args=['x', 'scale_a', 'scale_b', 'name'], varargs=None, keywords=None, defaults=(0.6666666666666666, 1.7159, None)) -paddle.fluid.layers.hard_sigmoid ArgSpec(args=['x', 'slope', 'offset', 'name'], varargs=None, keywords=None, defaults=(0.2, 0.5, None)) -paddle.fluid.layers.swish ArgSpec(args=['x', 'beta', 'name'], varargs=None, keywords=None, defaults=(1.0, None)) -paddle.fluid.layers.prelu ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, keywords=None, defaults=(None, None)) -paddle.fluid.layers.brelu ArgSpec(args=['x', 't_min', 't_max', 'name'], varargs=None, keywords=None, defaults=(0.0, 24.0, None)) -paddle.fluid.layers.leaky_relu ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(0.02, None)) -paddle.fluid.layers.soft_relu ArgSpec(args=['x', 'threshold', 'name'], varargs=None, keywords=None, defaults=(40.0, None)) -paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None)) -paddle.fluid.layers.sequence_mask ArgSpec(args=['x', 'maxlen', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 'int64', None)) -paddle.fluid.layers.stack ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,)) -paddle.fluid.layers.pad2d ArgSpec(args=['input', 'paddings', 'mode', 'pad_value', 'data_format', 'name'], varargs=None, keywords=None, defaults=([0, 0, 0, 0], 'constant', 0.0, 'NCHW', None)) -paddle.fluid.layers.unstack ArgSpec(args=['x', 'axis', 'num'], varargs=None, keywords=None, defaults=(0, None)) -paddle.fluid.layers.sequence_enumerate ArgSpec(args=['input', 'win_size', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0, None)) -paddle.fluid.layers.expand ArgSpec(args=['x', 'expand_times', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.sequence_concat ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.scale ArgSpec(args=['x', 'scale', 'bias', 'bias_after_scale', 'act', 'name'], varargs=None, keywords=None, defaults=(1.0, 0.0, True, None, None)) -paddle.fluid.layers.elementwise_add ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)) -paddle.fluid.layers.elementwise_div ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)) -paddle.fluid.layers.elementwise_sub ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)) -paddle.fluid.layers.elementwise_mul ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)) -paddle.fluid.layers.elementwise_max ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)) -paddle.fluid.layers.elementwise_min ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)) -paddle.fluid.layers.elementwise_pow ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)) -paddle.fluid.layers.uniform_random_batch_size_like ArgSpec(args=['input', 'shape', 'dtype', 'input_dim_idx', 'output_dim_idx', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', 0, 0, -1.0, 1.0, 0)) -paddle.fluid.layers.gaussian_random ArgSpec(args=['shape', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32')) -paddle.fluid.layers.sampling_id ArgSpec(args=['x', 'min', 'max', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32')) -paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=['input', 'shape', 'input_dim_idx', 'output_dim_idx', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0, 0, 0.0, 1.0, 0, 'float32')) -paddle.fluid.layers.sum ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.slice ArgSpec(args=['input', 'axes', 'starts', 'ends'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.shape ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.logical_and ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)) -paddle.fluid.layers.logical_or ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)) -paddle.fluid.layers.logical_xor ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)) -paddle.fluid.layers.logical_not ArgSpec(args=['x', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)) -paddle.fluid.layers.clip ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.clip_by_norm ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None)) -paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'ignore_index', 'name', 'normalize'], varargs=None, keywords=None, defaults=(-100, None, False)) -paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.sequence_reverse ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None)) -paddle.fluid.layers.similarity_focus ArgSpec(args=['input', 'axis', 'indexes', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None)) -paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)) -paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)) -paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)) -paddle.fluid.layers.shuffle_channel ArgSpec(args=['x', 'group', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.py_func ArgSpec(args=['func', 'x', 'out', 'backward_func', 'skip_vars_in_backward_input'], varargs=None, keywords=None, defaults=(None, None)) -paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.teacher_student_sigmoid_loss ArgSpec(args=['input', 'label', 'soft_max_up_bound', 'soft_max_lower_bound'], varargs=None, keywords=None, defaults=(15.0, -15.0)) -paddle.fluid.layers.huber_loss ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.tree_conv ArgSpec(args=['nodes_vector', 'edge_set', 'output_size', 'num_filters', 'max_depth', 'act', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 2, 'tanh', None, None, None)) -paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) -paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) -paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.shuffle ArgSpec(args=['reader', 'buffer_size'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.batch ArgSpec(args=['reader', 'batch_size'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.double_buffer ArgSpec(args=['reader', 'place', 'name'], varargs=None, keywords=None, defaults=(None, None)) -paddle.fluid.layers.random_data_generator ArgSpec(args=['low', 'high', 'shapes', 'lod_levels', 'for_parallel'], varargs=None, keywords=None, defaults=(True,)) -paddle.fluid.layers.py_reader ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True)) -paddle.fluid.layers.create_py_reader_by_data ArgSpec(args=['capacity', 'feed_list', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, True)) -paddle.fluid.layers.Preprocessor.__init__ ArgSpec(args=['self', 'reader', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.Preprocessor.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.Preprocessor.inputs ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.Preprocessor.outputs ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None) -paddle.fluid.layers.load ArgSpec(args=['out', 'file_path', 'load_as_fp16'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.create_tensor ArgSpec(args=['dtype', 'name', 'persistable'], varargs=None, keywords=None, defaults=(None, False)) -paddle.fluid.layers.create_parameter ArgSpec(args=['shape', 'dtype', 'name', 'attr', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(None, None, False, None)) -paddle.fluid.layers.create_global_var ArgSpec(args=['shape', 'value', 'dtype', 'persistable', 'force_cpu', 'name'], varargs=None, keywords=None, defaults=(False, False, None)) -paddle.fluid.layers.cast ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.tensor_array_to_tensor ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None)) -paddle.fluid.layers.concat ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(0, None)) -paddle.fluid.layers.sums ArgSpec(args=['input', 'out'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.assign ArgSpec(args=['input', 'output'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.fill_constant_batch_size_like ArgSpec(args=['input', 'shape', 'dtype', 'value', 'input_dim_idx', 'output_dim_idx'], varargs=None, keywords=None, defaults=(0, 0)) -paddle.fluid.layers.fill_constant ArgSpec(args=['shape', 'dtype', 'value', 'force_cpu', 'out'], varargs=None, keywords=None, defaults=(False, None)) -paddle.fluid.layers.argmin ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,)) -paddle.fluid.layers.argmax ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,)) -paddle.fluid.layers.argsort ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(-1, None)) -paddle.fluid.layers.ones ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,)) -paddle.fluid.layers.zeros ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,)) -paddle.fluid.layers.reverse ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.has_inf ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.has_nan ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.isfinite ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'is_test', 'name'], varargs=None, keywords=None, defaults=(False, None)) -paddle.fluid.layers.While.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.Switch.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.Switch.case ArgSpec(args=['self', 'condition'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.Switch.default ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.increment ArgSpec(args=['x', 'value', 'in_place'], varargs=None, keywords=None, defaults=(1.0, True)) -paddle.fluid.layers.array_write ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.create_array ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.less_than ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords='ignored', defaults=(None, None)) -paddle.fluid.layers.equal ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords='ignored', defaults=(None,)) -paddle.fluid.layers.array_read ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.array_length ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.IfElse.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.IfElse.false_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.IfElse.input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.IfElse.output ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None) -paddle.fluid.layers.IfElse.true_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.DynamicRNN.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.DynamicRNN.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.DynamicRNN.memory ArgSpec(args=['self', 'init', 'shape', 'value', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, False, 'float32')) -paddle.fluid.layers.DynamicRNN.output ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None) -paddle.fluid.layers.DynamicRNN.static_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.DynamicRNN.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.DynamicRNN.update_memory ArgSpec(args=['self', 'ex_mem', 'new_mem'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.StaticRNN.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.StaticRNN.memory ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1)) -paddle.fluid.layers.StaticRNN.output ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None) -paddle.fluid.layers.StaticRNN.step ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.StaticRNN.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.StaticRNN.step_output ArgSpec(args=['self', 'o'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.StaticRNN.update_memory ArgSpec(args=['self', 'mem', 'var'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.reorder_lod_tensor_by_rank ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.Print ArgSpec(args=['input', 'first_n', 'message', 'summarize', 'print_tensor_name', 'print_tensor_type', 'print_tensor_shape', 'print_tensor_lod', 'print_phase'], varargs=None, keywords=None, defaults=(-1, None, -1, True, True, True, True, 'both')) -paddle.fluid.layers.is_empty ArgSpec(args=['x', 'cond'], varargs=None, keywords='ignored', defaults=(None,)) -paddle.fluid.layers.sigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.logsigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.exp ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.tanh ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.tanh_shrink ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.softshrink ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.sqrt ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.abs ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.ceil ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.floor ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.cos ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.sin ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.round ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.reciprocal ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.square ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.softplus ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.softsign ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.uniform_random ArgSpec(args=['shape', 'dtype', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=(None, None, None, None)) -paddle.fluid.layers.hard_shrink ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.cumsum ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None)) -paddle.fluid.layers.thresholded_relu ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.prior_box ArgSpec(args=['input', 'image', 'min_sizes', 'max_sizes', 'aspect_ratios', 'variance', 'flip', 'clip', 'steps', 'offset', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, [1.0], [0.1, 0.1, 0.2, 0.2], False, False, [0.0, 0.0], 0.5, None, False)) -paddle.fluid.layers.density_prior_box ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', 'flatten_to_2d', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, False, None)) -paddle.fluid.layers.multi_box_head ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False)) -paddle.fluid.layers.bipartite_match ArgSpec(args=['dist_matrix', 'match_type', 'dist_threshold', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) -paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) -paddle.fluid.layers.detection_output ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0)) -paddle.fluid.layers.ssd_loss ArgSpec(args=['location', 'confidence', 'gt_box', 'gt_label', 'prior_box', 'prior_box_var', 'background_label', 'overlap_threshold', 'neg_pos_ratio', 'neg_overlap', 'loc_loss_weight', 'conf_loss_weight', 'match_type', 'mining_type', 'normalize', 'sample_size'], varargs=None, keywords=None, defaults=(None, 0, 0.5, 3.0, 0.5, 1.0, 1.0, 'per_prediction', 'max_negative', True, None)) -paddle.fluid.layers.detection_map ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral')) -paddle.fluid.layers.rpn_target_assign ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'is_crowd', 'im_info', 'rpn_batch_size_per_im', 'rpn_straddle_thresh', 'rpn_fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.0, 0.5, 0.7, 0.3, True)) -paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None)) -paddle.fluid.layers.roi_perspective_transform ArgSpec(args=['input', 'rois', 'transformed_height', 'transformed_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1.0,)) -paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True)) -paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None)) -paddle.fluid.layers.generate_mask_labels ArgSpec(args=['im_info', 'gt_classes', 'is_crowd', 'gt_segms', 'rois', 'labels_int32', 'num_classes', 'resolution'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0)) -paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.box_clip ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.multiclass_nms ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)) -paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)) -paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)) -paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) -paddle.fluid.layers.natural_exp_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) -paddle.fluid.layers.inverse_time_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) -paddle.fluid.layers.polynomial_decay ArgSpec(args=['learning_rate', 'decay_steps', 'end_learning_rate', 'power', 'cycle'], varargs=None, keywords=None, defaults=(0.0001, 1.0, False)) -paddle.fluid.layers.piecewise_decay ArgSpec(args=['boundaries', 'values'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.noam_decay ArgSpec(args=['d_model', 'warmup_steps'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.append_LARS ArgSpec(args=['params_grads', 'learning_rate', 'weight_decay'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.InitState.__init__ ArgSpec(args=['self', 'init', 'shape', 'value', 'init_boot', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, None, False, 'float32')) -paddle.fluid.contrib.StateCell.__init__ ArgSpec(args=['self', 'inputs', 'states', 'out_state', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.contrib.StateCell.compute_state ArgSpec(args=['self', 'inputs'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.StateCell.get_input ArgSpec(args=['self', 'input_name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.StateCell.get_state ArgSpec(args=['self', 'state_name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.StateCell.out_state ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.StateCell.set_state ArgSpec(args=['self', 'state_name', 'state_value'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.StateCell.state_updater ArgSpec(args=['self', 'updater'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.StateCell.update_states ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.TrainingDecoder.__init__ ArgSpec(args=['self', 'state_cell', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.contrib.TrainingDecoder.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.TrainingDecoder.output ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None) -paddle.fluid.contrib.TrainingDecoder.static_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.TrainingDecoder.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.BeamSearchDecoder.__init__ ArgSpec(args=['self', 'state_cell', 'init_ids', 'init_scores', 'target_dict_dim', 'word_dim', 'input_var_dict', 'topk_size', 'sparse_emb', 'max_len', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=({}, 50, True, 100, 1, 1, None)) -paddle.fluid.contrib.BeamSearchDecoder.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.BeamSearchDecoder.decode ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.BeamSearchDecoder.early_stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False)) -paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.op_freq_statistic ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_bits', 'activation_bits', 'activation_quantize_type', 'weight_quantize_type', 'window_size'], varargs=None, keywords=None, defaults=(8, 8, 'abs_max', 'abs_max', 10000)) -paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None)) -paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) -paddle.fluid.contrib.Calibrator.__init__ ArgSpec(args=['self'], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.contrib.Calibrator.sample_data ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.Calibrator.save_int8_model ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.reader.ctr_reader.ctr_reader ArgSpec(args=['feed_dict', 'file_type', 'file_format', 'dense_slot_index', 'sparse_slot_index', 'capacity', 'thread_num', 'batch_size', 'file_list', 'slots', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.contrib.build_compressor ArgSpec(args=['place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'config'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)) -paddle.fluid.contrib.CompressPass.__init__ ArgSpec(args=['self', 'place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'program_exe'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)) -paddle.fluid.contrib.CompressPass.add_strategy ArgSpec(args=['self', 'strategy'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.CompressPass.apply ArgSpec(args=['self', 'graph'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.ImitationGraph.__init__ ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.contrib.ImitationGraph.all_parameters ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.SensitivePruneStrategy.__init__ ArgSpec(args=['self', 'pruner', 'start_epoch', 'end_epoch', 'delta_rate', 'acc_loss_threshold', 'sensitivities'], varargs=None, keywords=None, defaults=(None, 0, 10, 0.2, 0.2, None)) -paddle.fluid.contrib.SensitivePruneStrategy.on_batch_begin ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.SensitivePruneStrategy.on_batch_end ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.SensitivePruneStrategy.on_compress_begin ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.SensitivePruneStrategy.on_compress_end ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_begin ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_end ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.MagnitudePruner.__init__ ArgSpec(args=['self', 'threshold'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.MagnitudePruner.prune ArgSpec(args=['self', 'param', 'threshold'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.contrib.RatioPruner.__init__ ArgSpec(args=['self', 'ratios'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.contrib.RatioPruner.prune ArgSpec(args=['self', 'param', 'ratio'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.contrib.load_persistables_for_increment ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.load_persistables_for_inference ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var_name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.convert_dist_to_sparse_program ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.HDFSClient.__init__ ArgSpec(args=['self', 'hadoop_home', 'configs'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.HDFSClient.delete ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.HDFSClient.download ArgSpec(args=['self', 'hdfs_path', 'local_path', 'overwrite', 'unzip'], varargs=None, keywords=None, defaults=(False, False)) -paddle.fluid.contrib.HDFSClient.is_dir ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.contrib.HDFSClient.is_exist ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.contrib.HDFSClient.ls ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.HDFSClient.lsr ArgSpec(args=['self', 'hdfs_path', 'only_file', 'sort'], varargs=None, keywords=None, defaults=(True, True)) -paddle.fluid.contrib.HDFSClient.make_local_dirs ArgSpec(args=['local_path'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.HDFSClient.makedirs ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None) -paddle.fluid.contrib.HDFSClient.rename ArgSpec(args=['self', 'hdfs_src_path', 'hdfs_dst_path', 'overwrite'], varargs=None, keywords=None, defaults=(False,)) -paddle.fluid.contrib.HDFSClient.upload ArgSpec(args=['self', 'hdfs_path', 'local_path', 'overwrite', 'retry_times'], varargs=None, keywords=None, defaults=(False, 5)) -paddle.fluid.contrib.multi_download ArgSpec(args=['client', 'hdfs_path', 'local_path', 'trainer_id', 'trainers', 'multi_processes'], varargs=None, keywords=None, defaults=(5,)) -paddle.fluid.contrib.multi_upload ArgSpec(args=['client', 'hdfs_path', 'local_path', 'multi_processes', 'overwrite', 'sync'], varargs=None, keywords=None, defaults=(5, False, True)) -paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) -paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) -paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) -paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)) -paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174')) -paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, False)) -paddle.fluid.transpiler.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.transpiler.HashName.__init__ ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None) -paddle.fluid.transpiler.HashName.dispatch ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None) -paddle.fluid.transpiler.HashName.reset ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.transpiler.RoundRobin.__init__ ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None) -paddle.fluid.transpiler.RoundRobin.dispatch ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None) -paddle.fluid.transpiler.RoundRobin.reset ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.io.save_vars (ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b55d6193a1d4198d45b013fc5779e1f2')) +paddle.fluid.io.save_params (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '3a7a99abac3e1bf898871fe609354218')) +paddle.fluid.io.save_persistables (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '9141bb5f32caf7975eb3fd88c8a1b2da')) +paddle.fluid.io.load_vars (ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '0a5308f496632ab1ec3ba1f1377e6f95')) +paddle.fluid.io.load_params (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '41779819cef32f2246e83aebc5a002e2')) +paddle.fluid.io.load_persistables (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '28df5bfe26ca7a077f91156abb0fe6d2')) +paddle.fluid.io.save_inference_model (ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True)), ('document', '582d87b8df75a5a639a107db8ff86f9c')) +paddle.fluid.io.load_inference_model (ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename', 'pserver_endpoints'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '7a5255386075dac3c75b7058254fcdcb')) +paddle.fluid.initializer.ConstantInitializer.__init__ (ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.initializer.UniformInitializer.__init__ (ArgSpec(args=['self', 'low', 'high', 'seed'], varargs=None, keywords=None, defaults=(-1.0, 1.0, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.initializer.NormalInitializer.__init__ (ArgSpec(args=['self', 'loc', 'scale', 'seed'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.initializer.TruncatedNormalInitializer.__init__ (ArgSpec(args=['self', 'loc', 'scale', 'seed'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.initializer.XavierInitializer.__init__ (ArgSpec(args=['self', 'uniform', 'fan_in', 'fan_out', 'seed'], varargs=None, keywords=None, defaults=(True, None, None, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.initializer.BilinearInitializer.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'd389912dc079cbef432335a00017cec0')) +paddle.fluid.initializer.MSRAInitializer.__init__ (ArgSpec(args=['self', 'uniform', 'fan_in', 'seed'], varargs=None, keywords=None, defaults=(True, None, 0)), ('document', '53c757bed9345f2ad3361902531e7cf5')) +paddle.fluid.initializer.force_init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '6d0f3e22c90d9d500d36ff57daf056ee')) +paddle.fluid.initializer.init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'a6d7011ca3d8c0d454dac3a56eae0c29')) +paddle.fluid.initializer.NumpyArrayInitializer.__init__ (ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.fc (ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None)), ('document', '1929058262994f212620599c63aea6bd')) +paddle.fluid.layers.embedding (ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')), ('document', '89c2c55a0b0656b106064048e068e77a')) +paddle.fluid.layers.dynamic_lstm (ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None)), ('document', 'dfbb624f85015df29e994ca6999e8ff6')) +paddle.fluid.layers.dynamic_lstmp (ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name', 'h_0', 'c_0', 'cell_clip', 'proj_clip'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None, None, None, None, None)), ('document', 'b4b608b986eb9617aa0525e1be21d32d')) +paddle.fluid.layers.dynamic_gru (ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None, False)), ('document', '4ec4845fd7d991bcac822f8b0dfc101f')) +paddle.fluid.layers.gru_unit (ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid', False)), ('document', 'e0e2439f7af069b57badca18a6ba60b8')) +paddle.fluid.layers.linear_chain_crf (ArgSpec(args=['input', 'label', 'param_attr'], varargs=None, keywords=None, defaults=(None,)), ('document', '7c49ef4bbf0adfd4b9a1d98e2e5f3fea')) +paddle.fluid.layers.crf_decoding (ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,)), ('document', '7642373ab65d3fc3b96d16d10fef1538')) +paddle.fluid.layers.cos_sim (ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None), ('document', 'd740824aa7316b807c4b4a3c6c8c0bbe')) +paddle.fluid.layers.cross_entropy (ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100)), ('document', '025b364dafb4b7975c801eb33e7831a1')) +paddle.fluid.layers.bpr_loss (ArgSpec(args=['input', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '30add751a0f99347a6257634c03ff254')) +paddle.fluid.layers.square_error_cost (ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None), ('document', '44b6eef4a0f2bc15f7d9745782406736')) +paddle.fluid.layers.chunk_eval (ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ee152a7ba3036e7b9ede9184545179b4')) +paddle.fluid.layers.sequence_conv (ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None)), ('document', 'b6543768e1afaa2ecb869709d6e9c7e2')) +paddle.fluid.layers.conv2d (ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)), ('document', '8ca6121acd6d23cd8806a93f493c2e17')) +paddle.fluid.layers.conv3d (ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)), ('document', '37042620f9bd3a2da6e5d3138b2f724b')) +paddle.fluid.layers.sequence_pool (ArgSpec(args=['input', 'pool_type', 'is_test'], varargs=None, keywords=None, defaults=(False,)), ('document', 'a194fb80614023f543df3949fbd0d0b8')) +paddle.fluid.layers.sequence_softmax (ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', '19ef6f9cdd27feac8a1ae060f19c10b4')) +paddle.fluid.layers.softmax (ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'f19dd380864e61134ce3814e4be0de4b')) +paddle.fluid.layers.pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)), ('document', 'bbd84e855e660cd1084bb71a2fd0cdaa')) +paddle.fluid.layers.pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)), ('document', '043de7333b79ee0ac55053c14ed81625')) +paddle.fluid.layers.adaptive_pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '859b887174d06f361658f69cb7c06d95')) +paddle.fluid.layers.adaptive_pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '120f4323a3d7ed9c0916f15a59f0e497')) +paddle.fluid.layers.batch_norm (ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)), ('document', 'c527b71b8a4c60dca8df8a745c2b598d')) +paddle.fluid.layers.data_norm (ArgSpec(args=['input', 'act', 'epsilon', 'param_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var'], varargs=None, keywords=None, defaults=(None, 1e-05, None, 'NCHW', False, None, None, None, False)), ('document', 'e45e09e65a2658e07cad987222f0d9ab')) +paddle.fluid.layers.beam_search_decode (ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b0b8d53821716cd50c42e09b593f3feb')) +paddle.fluid.layers.conv2d_transpose (ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)), ('document', '03993955ab1e6d3044c44e6f17fc85e9')) +paddle.fluid.layers.conv3d_transpose (ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)), ('document', 'ec113c6a3686ac94f8fccd1a7953d445')) +paddle.fluid.layers.sequence_expand (ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '79c375214fa427faac504043d162dae9')) +paddle.fluid.layers.sequence_expand_as (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '9d2611f84ab364c5da545e6a82f1770a')) +paddle.fluid.layers.sequence_pad (ArgSpec(args=['x', 'pad_value', 'maxlen', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6a1adf3067b20f6e4bcb354d71c19184')) +paddle.fluid.layers.sequence_unpad (ArgSpec(args=['x', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd12803c903c99aa36ec03aaac5f0cc5b')) +paddle.fluid.layers.lstm_unit (ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None)), ('document', '027723966f3ef0d7bc598f22287a96cc')) +paddle.fluid.layers.reduce_sum (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'b69998ce3ff4980fb21da0df05565f1b')) +paddle.fluid.layers.reduce_mean (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'd4d80dd98a1a5839f41eeb3a0f85f370')) +paddle.fluid.layers.reduce_max (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', '66a622db727551761ce4eb73eaa7f6a4')) +paddle.fluid.layers.reduce_min (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'd50ac552b5d131468ed466d08bb2d38c')) +paddle.fluid.layers.reduce_prod (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'fcd8301a0ce15f219c7a4bcd0c1e8eca')) +paddle.fluid.layers.sequence_first_step (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', '2b290d3d77882bfe9bb8d331cac8cdd3')) +paddle.fluid.layers.sequence_last_step (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', 'c16a892f44f7fe71bfa5afc32d3f34ce')) +paddle.fluid.layers.sequence_slice (ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'fdcea0e8b5bc7d8d4b1b072c521014e6')) +paddle.fluid.layers.dropout (ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name', 'dropout_implementation'], varargs=None, keywords=None, defaults=(False, None, None, 'downgrade_in_infer')), ('document', 'dc7042734c6d8b8ce97321f017f01d6f')) +paddle.fluid.layers.split (ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '652625345c2acb900029c78cc75f8aa6')) +paddle.fluid.layers.ctc_greedy_decoder (ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ebbf2adbd79683dc93db03454dfa18c2')) +paddle.fluid.layers.edit_distance (ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens'], varargs=None, keywords=None, defaults=(True, None)), ('document', '97f0262f97602644c83142789d784571')) +paddle.fluid.layers.l2_normalize (ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None)), ('document', '6e428384ce6a77207fa2c70d9f011990')) +paddle.fluid.layers.matmul (ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None)), ('document', 'b4cbe1ac451005df6dad12e9ffdccca9')) +paddle.fluid.layers.topk (ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd3570c02f71bcd78e60b3f31dc8f5b32')) +paddle.fluid.layers.warpctc (ArgSpec(args=['input', 'label', 'blank', 'norm_by_times', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, False, False)), ('document', 'aaba49c038ba927f0a8e45c0c9a686ab')) +paddle.fluid.layers.sequence_reshape (ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None), ('document', 'a10ab9bf88d4a7e328882d411abb6fd1')) +paddle.fluid.layers.transpose (ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'a1feac48b843d679db82312dc85885f4')) +paddle.fluid.layers.im2sequence (ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None)), ('document', '3ce01160ede80b1c26f776f8fef9340f')) +paddle.fluid.layers.nce (ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False)), ('document', 'fddad4896dee5193e1cdf70882c2a347')) +paddle.fluid.layers.sampled_softmax_with_cross_entropy (ArgSpec(args=['logits', 'label', 'num_samples', 'num_true', 'remove_accidental_hits', 'use_customized_samples', 'customized_samples', 'customized_probabilities', 'seed'], varargs=None, keywords=None, defaults=(1, True, False, None, None, 0)), ('document', '5db30b8a74e8c93687943a3e8d221da0')) +paddle.fluid.layers.hsigmoid (ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False)), ('document', '80641ee6810b1cdc3fd6e14fc89ecc9d')) +paddle.fluid.layers.beam_search (ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False)), ('document', 'b350b9a30a18e7efd7e1bb740eef6996')) +paddle.fluid.layers.row_conv (ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)), ('document', '17485788fffe4e2d36dc58c2ac8d174e')) +paddle.fluid.layers.multiplex (ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None), ('document', '013795af319e2e86d3506741941078ee')) +paddle.fluid.layers.layer_norm (ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)), ('document', 'de6a906950bae9f3c245cb744d22b94e')) +paddle.fluid.layers.group_norm (ArgSpec(args=['input', 'groups', 'epsilon', 'param_attr', 'bias_attr', 'act', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(1e-05, None, None, None, 'NCHW', None)), ('document', '419c3a24a83cc89219a029cf4092788b')) +paddle.fluid.layers.spectral_norm (ArgSpec(args=['weight', 'dim', 'power_iters', 'eps', 'name'], varargs=None, keywords=None, defaults=(0, 1, 1e-12, None)), ('document', '3f536aafba30d793287b52d231baff1b')) +paddle.fluid.layers.softmax_with_cross_entropy (ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode', 'return_softmax'], varargs=None, keywords=None, defaults=(False, -100, True, False)), ('document', 'bce1b75e3d95b75cacd1099655cbb3c3')) +paddle.fluid.layers.smooth_l1 (ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', 'c6b175d253c55baf4b9c0eca9b1dda88')) +paddle.fluid.layers.one_hot (ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None), ('document', '6148b6a555cbfb62fdcd030d8982c18c')) +paddle.fluid.layers.autoincreased_step_counter (ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1)), ('document', '3f6c828594720c9b2da89c464be94478')) +paddle.fluid.layers.reshape (ArgSpec(args=['x', 'shape', 'actual_shape', 'act', 'inplace', 'name'], varargs=None, keywords=None, defaults=(None, None, False, None)), ('document', '323c019f257e55ddea4a824a362de62f')) +paddle.fluid.layers.squeeze (ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '3229d06517f794e86ca3da14c38b1465')) +paddle.fluid.layers.unsqueeze (ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'bbd62da391b1df984a1909d069a759b2')) +paddle.fluid.layers.lod_reset (ArgSpec(args=['x', 'y', 'target_lod'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'f122194c562bd674f6ecdccf33785f99')) +paddle.fluid.layers.lrn (ArgSpec(args=['input', 'n', 'k', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(5, 1.0, 0.0001, 0.75, None)), ('document', '0795e9940e42dcd62953514ff7e09f77')) +paddle.fluid.layers.pad (ArgSpec(args=['x', 'paddings', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None)), ('document', '2f28153bdd2d5ea6f7bad5867bd03eeb')) +paddle.fluid.layers.pad_constant_like (ArgSpec(args=['x', 'y', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None)), ('document', 'd2e1f45fef51b2c214e3f2aa8976c46c')) +paddle.fluid.layers.label_smooth (ArgSpec(args=['label', 'prior_dist', 'epsilon', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 0.1, 'float32', None)), ('document', '70c113658102a11cc5d8e3d45145737a')) +paddle.fluid.layers.roi_pool (ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0)), ('document', 'c317aa595deb31649083c8faa91cdb97')) +paddle.fluid.layers.roi_align (ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None)), ('document', '12c5bbb8b38c42e623fbc47611d766e1')) +paddle.fluid.layers.dice_loss (ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,)), ('document', '1ba0508d573f65feecf3564dce22aa1d')) +paddle.fluid.layers.image_resize (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None, True, 1)), ('document', '7a1966d7c3a48f1fc0881cdaf5d83b0b')) +paddle.fluid.layers.image_resize_short (ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',)), ('document', '06211aefc50c5a3e940d7204d859cdf7')) +paddle.fluid.layers.resize_bilinear (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, None, True, 1)), ('document', 'e4fb4ed511b2293b8f04f7e872afbfd7')) +paddle.fluid.layers.resize_nearest (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners'], varargs=None, keywords=None, defaults=(None, None, None, None, True)), ('document', '735fa9758a6d7ff3b47d7b827f961c1d')) +paddle.fluid.layers.gather (ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None), ('document', '98f1c86716b9b7f4dda83f20e2adeee2')) +paddle.fluid.layers.scatter (ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '65f8e9d8ddfd0b412f940579c4faa342')) +paddle.fluid.layers.sequence_scatter (ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '15b522457dfef103f0c20ca9d397678b')) +paddle.fluid.layers.random_crop (ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c9ab9e460ef0a1823249935a30e82c66')) +paddle.fluid.layers.mean_iou (ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None), ('document', '35cbbdfa585d027bb490707c95a176b9')) +paddle.fluid.layers.relu (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '866ffa1cc93f29e23662b526a7596537')) +paddle.fluid.layers.selu (ArgSpec(args=['x', 'scale', 'alpha', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '9044c7fe667b76cb2d9264f2db11f417')) +paddle.fluid.layers.log (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '98247c59d1c9b40af6730001b2aea73d')) +paddle.fluid.layers.crop (ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '883104791204d3127e24234bb630b2e7')) +paddle.fluid.layers.rank_loss (ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c542e39ac6add24a6bef6e79bf5617e2')) +paddle.fluid.layers.margin_rank_loss (ArgSpec(args=['label', 'left', 'right', 'margin', 'name'], varargs=None, keywords=None, defaults=(0.1, None)), ('document', '6d19dcc19917080b7ff3e03bde451bc8')) +paddle.fluid.layers.elu (ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(1.0, None)), ('document', '463258ee9f8b60760eb1e26357cc9bfa')) +paddle.fluid.layers.relu6 (ArgSpec(args=['x', 'threshold', 'name'], varargs=None, keywords=None, defaults=(6.0, None)), ('document', '6f367339caf6c7124bc262fe1475df70')) +paddle.fluid.layers.pow (ArgSpec(args=['x', 'factor', 'name'], varargs=None, keywords=None, defaults=(1.0, None)), ('document', 'a5117c1eb84aca2ac0b0abab337a4799')) +paddle.fluid.layers.stanh (ArgSpec(args=['x', 'scale_a', 'scale_b', 'name'], varargs=None, keywords=None, defaults=(0.6666666666666666, 1.7159, None)), ('document', '959936a477efc6c1447a9c8bf8ce94bb')) +paddle.fluid.layers.hard_sigmoid (ArgSpec(args=['x', 'slope', 'offset', 'name'], varargs=None, keywords=None, defaults=(0.2, 0.5, None)), ('document', 'c82059b6fea1aa730f9aac911807b756')) +paddle.fluid.layers.swish (ArgSpec(args=['x', 'beta', 'name'], varargs=None, keywords=None, defaults=(1.0, None)), ('document', 'ef745e55a48763ee7b46b21a81dc7e84')) +paddle.fluid.layers.prelu (ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'f6acef7ff7d887e49ff499fbb1dad4a9')) +paddle.fluid.layers.brelu (ArgSpec(args=['x', 't_min', 't_max', 'name'], varargs=None, keywords=None, defaults=(0.0, 24.0, None)), ('document', '3db337c195e156e6ef2b8b4a57113600')) +paddle.fluid.layers.leaky_relu (ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(0.02, None)), ('document', 'f878486c82b576938151daad0de995a0')) +paddle.fluid.layers.soft_relu (ArgSpec(args=['x', 'threshold', 'name'], varargs=None, keywords=None, defaults=(40.0, None)), ('document', '869adce548c342d6cc1bd88a948d83c9')) +paddle.fluid.layers.flatten (ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', 'cb295c13cb957db85cd9609269d7784d')) +paddle.fluid.layers.sequence_mask (ArgSpec(args=['x', 'maxlen', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 'int64', None)), ('document', 'f0dd6eddd3bff015a3c05269d82fcbd8')) +paddle.fluid.layers.stack (ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,)), ('document', '367cfbb642839beacb5d117e2d2b4041')) +paddle.fluid.layers.pad2d (ArgSpec(args=['input', 'paddings', 'mode', 'pad_value', 'data_format', 'name'], varargs=None, keywords=None, defaults=([0, 0, 0, 0], 'constant', 0.0, 'NCHW', None)), ('document', '7f4d46320cc077ca2e8db600c35f4030')) +paddle.fluid.layers.unstack (ArgSpec(args=['x', 'axis', 'num'], varargs=None, keywords=None, defaults=(0, None)), ('document', '98eb9d633116efcfc6f90c114bd44fd6')) +paddle.fluid.layers.sequence_enumerate (ArgSpec(args=['input', 'win_size', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0, None)), ('document', 'f6028537085dc296103bbbd85fa7763d')) +paddle.fluid.layers.expand (ArgSpec(args=['x', 'expand_times', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '117d3607d1ffa0571835bbaebc7857ff')) +paddle.fluid.layers.sequence_concat (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '3a1d155dd1bf6e72a0a3e3e1519591d1')) +paddle.fluid.layers.scale (ArgSpec(args=['x', 'scale', 'bias', 'bias_after_scale', 'act', 'name'], varargs=None, keywords=None, defaults=(1.0, 0.0, True, None, None)), ('document', '30190413b2fa442e7466d6cf2ce5ea07')) +paddle.fluid.layers.elementwise_add (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '6bfbe72cbadc95ac7ab88c05ed5bf9f0')) +paddle.fluid.layers.elementwise_div (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', 'cc6e6cc1cb942a152dde3ef08d5f165c')) +paddle.fluid.layers.elementwise_sub (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', 'a12abdab09c3e57af5a6e1e9f138684a')) +paddle.fluid.layers.elementwise_mul (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '422c77dbfcff355a57b5fdd4ec876daa')) +paddle.fluid.layers.elementwise_max (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', 'f0bb0b2c454541cfafa761021a5cc776')) +paddle.fluid.layers.elementwise_min (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '8a9cdefefbccbf9f6b0991c0946a21e9')) +paddle.fluid.layers.elementwise_pow (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '1aea4e197c552a284f83888a3c67a32e')) +paddle.fluid.layers.uniform_random_batch_size_like (ArgSpec(args=['input', 'shape', 'dtype', 'input_dim_idx', 'output_dim_idx', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', 0, 0, -1.0, 1.0, 0)), ('document', '129e0a3257f1d532a948eedf9d5bf671')) +paddle.fluid.layers.gaussian_random (ArgSpec(args=['shape', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32')), ('document', '389dafe36e099841b6a7fb18d11f1b4c')) +paddle.fluid.layers.sampling_id (ArgSpec(args=['x', 'min', 'max', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32')), ('document', '840fdac643d1341c1cae218d4511dbb9')) +paddle.fluid.layers.gaussian_random_batch_size_like (ArgSpec(args=['input', 'shape', 'input_dim_idx', 'output_dim_idx', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0, 0, 0.0, 1.0, 0, 'float32')), ('document', '840026b4766613c5705e06563cd103b6')) +paddle.fluid.layers.sum (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', 'a418e3ccb5e2ac21bd60f5cc221d5860')) +paddle.fluid.layers.slice (ArgSpec(args=['input', 'axes', 'starts', 'ends'], varargs=None, keywords=None, defaults=None), ('document', '01dbb91e7c74cb11336cd531013de51a')) +paddle.fluid.layers.shape (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', '17db0f814eb7bb5a3fac1ca6e60e16d8')) +paddle.fluid.layers.logical_and (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'cdcf20c494c92060d10feb9374532f42')) +paddle.fluid.layers.logical_or (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '0eae3f726a4afe590757552fa3ced012')) +paddle.fluid.layers.logical_xor (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'b0daaa3fa4a0aa62f9b58c43d959eb25')) +paddle.fluid.layers.logical_not (ArgSpec(args=['x', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'cd1c8cf31e040427d4e05711044caeb6')) +paddle.fluid.layers.clip (ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b020b7aab59719be98a4ae229a76deba')) +paddle.fluid.layers.clip_by_norm (ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'a1ea0bc5a926f427458c4254ca022749')) +paddle.fluid.layers.mean (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd638d915195ce86a8d7963b81110d4c8')) +paddle.fluid.layers.mul (ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None)), ('document', 'ccd37fa6b53f074adbfb732d738c4c2d')) +paddle.fluid.layers.sigmoid_cross_entropy_with_logits (ArgSpec(args=['x', 'label', 'ignore_index', 'name', 'normalize'], varargs=None, keywords=None, defaults=(-100, None, False)), ('document', '180c284317ea45ef89a460d8d79c0b72')) +paddle.fluid.layers.maxout (ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '891870d069a6aea746d34cc53b61690c')) +paddle.fluid.layers.space_to_depth (ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5f207ae10589ebe38a63575ef6ff8e1e')) +paddle.fluid.layers.affine_grid (ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '51def402b8910e163cbace9d0c0526ed')) +paddle.fluid.layers.sequence_reverse (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '77a6d80aa5551ca70324fc975c44507f')) +paddle.fluid.layers.affine_channel (ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None)), ('document', '2f46f1ff39a13ab00857e7b9f44b2fa7')) +paddle.fluid.layers.similarity_focus (ArgSpec(args=['input', 'axis', 'indexes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '70e3b5182a18b40b47ecabd7c8490a35')) +paddle.fluid.layers.hash (ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', '9bb77f8dc002dd2ce75d4769eaaf5007')) +paddle.fluid.layers.grid_sampler (ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd256cba1c41a5ed92ce3f31e24a2ca6d')) +paddle.fluid.layers.log_loss (ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)), ('document', '4b5a2341023afe63157a066c14254f98')) +paddle.fluid.layers.add_position_encoding (ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '4b9c2e8af5817937d831820874b5aa77')) +paddle.fluid.layers.bilinear_tensor_product (ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'aa7540a0fa73ff69a02e11b4091aab75')) +paddle.fluid.layers.merge_selected_rows (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'dc63315b84f591ac79ecca0c3632027a')) +paddle.fluid.layers.get_tensor_from_selected_rows (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '7ffc849e71f31dfe29030ff94e662de6')) +paddle.fluid.layers.lstm (ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)), ('document', 'd5e6c494ac35100e2ed4d4bd9a1ed932')) +paddle.fluid.layers.shuffle_channel (ArgSpec(args=['x', 'group', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '2fa6782d43d02ae64482d21235a82949')) +paddle.fluid.layers.py_func (ArgSpec(args=['func', 'x', 'out', 'backward_func', 'skip_vars_in_backward_input'], varargs=None, keywords=None, defaults=(None, None)), ('document', '8404e472ac12b4a30a505d3d3a3e5fdb')) +paddle.fluid.layers.psroi_pool (ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '1546136806fef5c08f6918544bd9151d')) +paddle.fluid.layers.teacher_student_sigmoid_loss (ArgSpec(args=['input', 'label', 'soft_max_up_bound', 'soft_max_lower_bound'], varargs=None, keywords=None, defaults=(15.0, -15.0)), ('document', '2f6ff96864054a31aa4bb659c6722c99')) +paddle.fluid.layers.huber_loss (ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None), ('document', '431a4301c35032166ec029f7432c80a7')) +paddle.fluid.layers.tree_conv (ArgSpec(args=['nodes_vector', 'edge_set', 'output_size', 'num_filters', 'max_depth', 'act', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 2, 'tanh', None, None, None)), ('document', '34ea12ac9f10a65dccbc50100d12e607')) +paddle.fluid.layers.npair_loss (ArgSpec(args=['anchor', 'positive', 'labels', 'l2_reg'], varargs=None, keywords=None, defaults=(0.002,)), ('document', '46994d10276dd4cb803b4062b5d14329')) +paddle.fluid.layers.data (ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)), ('document', '33bbd42027d872b3818b3d64ec52e139')) +paddle.fluid.layers.open_files (ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)), ('document', 'b1ae2e1cc0750e58726374061ea90ecc')) +paddle.fluid.layers.read_file (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', 'b0a1c2fc51c27a106da28f3308c41f5e')) +paddle.fluid.layers.shuffle (ArgSpec(args=['reader', 'buffer_size'], varargs=None, keywords=None, defaults=None), ('document', 'f967a73426db26f970bc70bfb03cffca')) +paddle.fluid.layers.batch (ArgSpec(args=['reader', 'batch_size'], varargs=None, keywords=None, defaults=None), ('document', 'f563d376d35e1a4c4db100fd11b381a0')) +paddle.fluid.layers.double_buffer (ArgSpec(args=['reader', 'place', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '07e5b796674796eb1ef3fee9c10d24e3')) +paddle.fluid.layers.random_data_generator (ArgSpec(args=['low', 'high', 'shapes', 'lod_levels', 'for_parallel'], varargs=None, keywords=None, defaults=(True,)), ('document', '9b7f0f86ec24bbc97643cadcb6499cff')) +paddle.fluid.layers.py_reader (ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '13dabc57863f62ab3141586784ee356b')) +paddle.fluid.layers.create_py_reader_by_data (ArgSpec(args=['capacity', 'feed_list', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, True)), ('document', '350f74d93fab9adb2ac4950f1c26416b')) +paddle.fluid.layers.Preprocessor.__init__ (ArgSpec(args=['self', 'reader', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.Preprocessor.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.Preprocessor.inputs (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.Preprocessor.outputs (ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.load (ArgSpec(args=['out', 'file_path', 'load_as_fp16'], varargs=None, keywords=None, defaults=(None,)), ('document', '9d1a4bc97bbce9fa1d4f7a4200a771ff')) +paddle.fluid.layers.create_tensor (ArgSpec(args=['dtype', 'name', 'persistable'], varargs=None, keywords=None, defaults=(None, False)), ('document', 'c0c3d0194f83fff8ea99ce0820657dae')) +paddle.fluid.layers.create_parameter (ArgSpec(args=['shape', 'dtype', 'name', 'attr', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(None, None, False, None)), ('document', 'd62b866c899bc1fedb5385f95b88e1f8')) +paddle.fluid.layers.create_global_var (ArgSpec(args=['shape', 'value', 'dtype', 'persistable', 'force_cpu', 'name'], varargs=None, keywords=None, defaults=(False, False, None)), ('document', 'ab914fac893607e29ac6e52bbdbea1a4')) +paddle.fluid.layers.cast (ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '992eb42590fc1c380841a6db72ce78b3')) +paddle.fluid.layers.tensor_array_to_tensor (ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', 'b12717d3d4567e6119589f7f655b0cbb')) +paddle.fluid.layers.concat (ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(0, None)), ('document', 'b19b79be4f05e85d1d6cec642c9fb535')) +paddle.fluid.layers.sums (ArgSpec(args=['input', 'out'], varargs=None, keywords=None, defaults=(None,)), ('document', '42912092418620b4be07f36af31e7816')) +paddle.fluid.layers.assign (ArgSpec(args=['input', 'output'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b690184f3537df5501e4d9d8f31152a5')) +paddle.fluid.layers.fill_constant_batch_size_like (ArgSpec(args=['input', 'shape', 'dtype', 'value', 'input_dim_idx', 'output_dim_idx'], varargs=None, keywords=None, defaults=(0, 0)), ('document', 'd4059a2f5763036b07018d76429f9acb')) +paddle.fluid.layers.fill_constant (ArgSpec(args=['shape', 'dtype', 'value', 'force_cpu', 'out'], varargs=None, keywords=None, defaults=(False, None)), ('document', '1d8b14729639fa38509c79b9784740fa')) +paddle.fluid.layers.argmin (ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,)), ('document', '2778a1d34be49263a51211885599ea37')) +paddle.fluid.layers.argmax (ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,)), ('document', '04114996cfb98994ba222804a1a6109f')) +paddle.fluid.layers.argsort (ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '68ec45c6fb6b93e47de9c9a0945fb98e')) +paddle.fluid.layers.ones (ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,)), ('document', 'b402489c62e668df42e7daceb63c142b')) +paddle.fluid.layers.zeros (ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,)), ('document', 'c155e2efc56ffa5ed4658cca0272e491')) +paddle.fluid.layers.reverse (ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None), ('document', '8ee7cb6ca639e7460e825f953b65d94d')) +paddle.fluid.layers.has_inf (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '8f8c0306117ea441f20dcbbdba1f0ecc')) +paddle.fluid.layers.has_nan (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '2e53e83127dbfd86e7098bdfe9a549e8')) +paddle.fluid.layers.isfinite (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '0a437011c3906079fd8947ed3e52d292')) +paddle.fluid.layers.While.__init__ (ArgSpec(args=['self', 'cond', 'is_test', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.While.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.Switch.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.Switch.case (ArgSpec(args=['self', 'condition'], varargs=None, keywords=None, defaults=None), ('document', 'f7c7160014c1b46cfeda9dd5808d1789')) +paddle.fluid.layers.Switch.default (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '50853ae884df03d9c36703bb46d9ef07')) +paddle.fluid.layers.increment (ArgSpec(args=['x', 'value', 'in_place'], varargs=None, keywords=None, defaults=(1.0, True)), ('document', '73bb96ec4783ec1a11e760e8851b0e77')) +paddle.fluid.layers.array_write (ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,)), ('document', '40b6d15f4c86b2b09df340d7778ad713')) +paddle.fluid.layers.create_array (ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None), ('document', '2d4f20087080ba5105b55205ad5c5b6a')) +paddle.fluid.layers.less_than (ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords=None, defaults=(None, None)), ('document', '067bbc799c66289ca8b8924c26b6673f')) +paddle.fluid.layers.equal (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '80c29b1dc64718f0116de90d1ac88a77')) +paddle.fluid.layers.array_read (ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None), ('document', '0275133f1dde2aed528b4d3230edf823')) +paddle.fluid.layers.array_length (ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None), ('document', 'ffb8b9578ec66db565b223d313aa82a2')) +paddle.fluid.layers.IfElse.__init__ (ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.IfElse.false_block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.IfElse.input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.IfElse.output (ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.IfElse.true_block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.DynamicRNN.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.DynamicRNN.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6d3e0a5d9aa519a9773a36e1620ea9b7')) +paddle.fluid.layers.DynamicRNN.memory (ArgSpec(args=['self', 'init', 'shape', 'value', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, False, 'float32')), ('document', 'b9174d4e91505b0c8ecc193eb51e248d')) +paddle.fluid.layers.DynamicRNN.output (ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None), ('document', 'b439a176a3328de8a75bdc5c08eece4a')) +paddle.fluid.layers.DynamicRNN.static_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', 'f29ad2478b6b2ad4f413d2936a331ea0')) +paddle.fluid.layers.DynamicRNN.step_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', '169d694d2224f62b4f3afdc3dbc19e95')) +paddle.fluid.layers.DynamicRNN.update_memory (ArgSpec(args=['self', 'ex_mem', 'new_mem'], varargs=None, keywords=None, defaults=None), ('document', '5d83987da13b98363d6a807a52d8024f')) +paddle.fluid.layers.StaticRNN.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.StaticRNN.memory (ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1)), ('document', 'c24e368e23afac1ed91a78a639d7a9c7')) +paddle.fluid.layers.StaticRNN.output (ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.StaticRNN.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.StaticRNN.step_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.StaticRNN.step_output (ArgSpec(args=['self', 'o'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.StaticRNN.update_memory (ArgSpec(args=['self', 'mem', 'var'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.reorder_lod_tensor_by_rank (ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None), ('document', '3545f529ef04e8f6ecb76b47fa3df01a')) +paddle.fluid.layers.Print (ArgSpec(args=['input', 'first_n', 'message', 'summarize', 'print_tensor_name', 'print_tensor_type', 'print_tensor_shape', 'print_tensor_lod', 'print_phase'], varargs=None, keywords=None, defaults=(-1, None, -1, True, True, True, True, 'both')), ('document', '5fef91b0e21c93610785f2b1f7161732')) +paddle.fluid.layers.is_empty (ArgSpec(args=['x', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', 'bbe578dbb49ad13e15b014e98c22b519')) +paddle.fluid.layers.sigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '29a25ba78de79152076cacfc5443137d')) +paddle.fluid.layers.logsigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '81ccb7acafd06c7728e11581f5d342e3')) +paddle.fluid.layers.exp (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e6b3e769413d96aab4176f96db25984b')) +paddle.fluid.layers.tanh (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e9d586a0b5bd05f67ee78048f9d503b6')) +paddle.fluid.layers.atan (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '3a46e0b5f9ce82348406478e610f14c9')) +paddle.fluid.layers.tanh_shrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '1e521554b9fdda9061ec6d306f0709b7')) +paddle.fluid.layers.softshrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '9eef31597bbafa2bd49691e072296e13')) +paddle.fluid.layers.sqrt (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '072a8541e0f632366bba10f67cb0db27')) +paddle.fluid.layers.abs (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '64650ac42cf82e9920cb0b172b1d29fd')) +paddle.fluid.layers.ceil (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c75d67dc5fe28f68e4cfffead4f698ad')) +paddle.fluid.layers.floor (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '647b16c5da5ef909649ae02abb434973')) +paddle.fluid.layers.cos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '485f2686bcc2fe37a4bd893769c8a3e2')) +paddle.fluid.layers.acos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '920a47734482276c069ba24c61c26b25')) +paddle.fluid.layers.asin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'cf4ee2c9b9d7293556f8c5173dfb5d2c')) +paddle.fluid.layers.sin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '01f1766aa76eff1df30147505b59f7c4')) +paddle.fluid.layers.round (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b47f5da13913d3e56bdb1e612a73f3f2')) +paddle.fluid.layers.reciprocal (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'cc6ac2f14f03c52aaa83a59bf83b8d26')) +paddle.fluid.layers.square (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '48dfb45d773dbc30126c3a7f777de5ee')) +paddle.fluid.layers.softplus (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '459c5781e9d1dd88283b7c5769d7872a')) +paddle.fluid.layers.softsign (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '80846bcd4bd457207457a6d5411f4148')) +paddle.fluid.layers.uniform_random (ArgSpec(args=['shape', 'dtype', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', -1.0, 1.0, 0)), ('document', '308b619af849caa82bbc31e897f5e641')) +paddle.fluid.layers.hard_shrink (ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c142f5884f3255e0d6075c286bbd531e')) +paddle.fluid.layers.cumsum (ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '944d7c03057f5fc88bc78acd4d82f926')) +paddle.fluid.layers.thresholded_relu (ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)), ('document', '90566ea449ea4c681435546e2f70610a')) +paddle.fluid.layers.prior_box (ArgSpec(args=['input', 'image', 'min_sizes', 'max_sizes', 'aspect_ratios', 'variance', 'flip', 'clip', 'steps', 'offset', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, [1.0], [0.1, 0.1, 0.2, 0.2], False, False, [0.0, 0.0], 0.5, None, False)), ('document', '14cac0ee643fa6e026ad82aeeee75bd8')) +paddle.fluid.layers.density_prior_box (ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', 'flatten_to_2d', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, False, None)), ('document', 'a0d762bb08de9ce93bc780aa57cd5cd9')) +paddle.fluid.layers.multi_box_head (ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False)), ('document', 'a6ab47a2fe681e52fabb7057ddf0efdd')) +paddle.fluid.layers.bipartite_match (ArgSpec(args=['dist_matrix', 'match_type', 'dist_threshold', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '3ddb9b966f193900193a95a3df77c3c1')) +paddle.fluid.layers.target_assign (ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', 'c0b334f917828f95056f6ebe10907b1c')) +paddle.fluid.layers.detection_output (ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0)), ('document', 'c33093a82a46e3091e789e5572588db1')) +paddle.fluid.layers.ssd_loss (ArgSpec(args=['location', 'confidence', 'gt_box', 'gt_label', 'prior_box', 'prior_box_var', 'background_label', 'overlap_threshold', 'neg_pos_ratio', 'neg_overlap', 'loc_loss_weight', 'conf_loss_weight', 'match_type', 'mining_type', 'normalize', 'sample_size'], varargs=None, keywords=None, defaults=(None, 0, 0.5, 3.0, 0.5, 1.0, 1.0, 'per_prediction', 'max_negative', True, None)), ('document', '6d5028fd09d01ab82d296adc0ea95aee')) +paddle.fluid.layers.detection_map (ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral')), ('document', '1467d91b50c22cd52103b4aa1ee9d0a1')) +paddle.fluid.layers.rpn_target_assign (ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'is_crowd', 'im_info', 'rpn_batch_size_per_im', 'rpn_straddle_thresh', 'rpn_fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.0, 0.5, 0.7, 0.3, True)), ('document', '1dddef3eb4b3cbd4df8e03ac480dbf97')) +paddle.fluid.layers.anchor_generator (ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None)), ('document', '23337cc57bbf5be73884b6bd0f849603')) +paddle.fluid.layers.roi_perspective_transform (ArgSpec(args=['input', 'rois', 'transformed_height', 'transformed_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1.0,)), ('document', '5761f9ed83654314416e24372b33bb84')) +paddle.fluid.layers.generate_proposal_labels (ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True)), ('document', '87863717edeb7fe87a1268976cbc015d')) +paddle.fluid.layers.generate_proposals (ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None)), ('document', '57ab49f3f324f310b7eed322e7c1057a')) +paddle.fluid.layers.generate_mask_labels (ArgSpec(args=['im_info', 'gt_classes', 'is_crowd', 'gt_segms', 'rois', 'labels_int32', 'num_classes', 'resolution'], varargs=None, keywords=None, defaults=None), ('document', 'f73706a65468e9ca3e0bee4a31521b0a')) +paddle.fluid.layers.iou_similarity (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '587845f60c5d97ffdf2dfd21da52eca1')) +paddle.fluid.layers.box_coder (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0)), ('document', '032d0f4b7d8f6235ee5d91e473344f0e')) +paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0e5ac2507723a0b5adec473f9556799b')) +paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '991e934c3e09abf0edec7c9c978b4691')) +paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '397e9e02b451d99c56e20f268fa03f2e')) +paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'ca7d1107b6c5d2d6d8221039a220fde0')) +paddle.fluid.layers.distribute_fpn_proposals (ArgSpec(args=['fpn_rois', 'min_level', 'max_level', 'refer_level', 'refer_scale', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '7bb011ec26bace2bc23235aa4a17647d')) +paddle.fluid.layers.box_decoder_and_assign (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'box_score', 'box_clip', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '005a5ae47d6c8fff721931d69d072b9f')) +paddle.fluid.layers.accuracy (ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)), ('document', '9808534c12c5e739a10f73ebb0b4eafd')) +paddle.fluid.layers.auc (ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)), ('document', 'e0e95334fce92d16c2d9db6e7caffc47')) +paddle.fluid.layers.exponential_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)), ('document', '98a5050bee8522fcea81aa795adaba51')) +paddle.fluid.layers.natural_exp_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)), ('document', '676a7bc2a218691db50bca233903d21e')) +paddle.fluid.layers.inverse_time_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)), ('document', 'd07e767d59c4a5e6c930f3e6756d3f82')) +paddle.fluid.layers.polynomial_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'end_learning_rate', 'power', 'cycle'], varargs=None, keywords=None, defaults=(0.0001, 1.0, False)), ('document', '882634f420f626642f0874481263da40')) +paddle.fluid.layers.piecewise_decay (ArgSpec(args=['boundaries', 'values'], varargs=None, keywords=None, defaults=None), ('document', 'c717d9d1d78a53c809d01b8bc56f3cae')) +paddle.fluid.layers.noam_decay (ArgSpec(args=['d_model', 'warmup_steps'], varargs=None, keywords=None, defaults=None), ('document', 'd9a95746353fd574be36dc28d8726c28')) +paddle.fluid.layers.append_LARS (ArgSpec(args=['params_grads', 'learning_rate', 'weight_decay'], varargs=None, keywords=None, defaults=None), ('document', 'd24fa1e7d62ac8a534fc6a86002f84f8')) +paddle.fluid.layers.cosine_decay (ArgSpec(args=['learning_rate', 'step_each_epoch', 'epochs'], varargs=None, keywords=None, defaults=None), ('document', '9588c64c26ffaef3c466e404a6af9d9b')) +paddle.fluid.contrib.InitState.__init__ (ArgSpec(args=['self', 'init', 'shape', 'value', 'init_boot', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, None, False, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.StateCell.__init__ (ArgSpec(args=['self', 'inputs', 'states', 'out_state', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.StateCell.compute_state (ArgSpec(args=['self', 'inputs'], varargs=None, keywords=None, defaults=None), ('document', '92973b3f222081a1d17069c683cf4a99')) +paddle.fluid.contrib.StateCell.get_input (ArgSpec(args=['self', 'input_name'], varargs=None, keywords=None, defaults=None), ('document', '6f24a007cfa184e32f01a960703bfd70')) +paddle.fluid.contrib.StateCell.get_state (ArgSpec(args=['self', 'state_name'], varargs=None, keywords=None, defaults=None), ('document', '630a4945cfe659ea4f307598fbbce5d2')) +paddle.fluid.contrib.StateCell.out_state (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '7ad681dff0393ddf13a724194e720f28')) +paddle.fluid.contrib.StateCell.set_state (ArgSpec(args=['self', 'state_name', 'state_value'], varargs=None, keywords=None, defaults=None), ('document', 'd4e0e08cd5d9d9a571cbc52d114f5ae9')) +paddle.fluid.contrib.StateCell.state_updater (ArgSpec(args=['self', 'updater'], varargs=None, keywords=None, defaults=None), ('document', 'd5afe1b7665d94fb023b15cf913ca510')) +paddle.fluid.contrib.StateCell.update_states (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'fe0b0f1338723516a35a30247899c81b')) +paddle.fluid.contrib.TrainingDecoder.__init__ (ArgSpec(args=['self', 'state_cell', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.TrainingDecoder.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '98d88fa1c989748410a12517c6a585bf')) +paddle.fluid.contrib.TrainingDecoder.output (ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None), ('document', 'f0a457dee586559036202087ce2eff69')) +paddle.fluid.contrib.TrainingDecoder.static_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', 'a024c72664fe815068423ba630b7658a')) +paddle.fluid.contrib.TrainingDecoder.step_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', '4659db7a888a2495e71c1838a0483909')) +paddle.fluid.contrib.BeamSearchDecoder.__init__ (ArgSpec(args=['self', 'state_cell', 'init_ids', 'init_scores', 'target_dict_dim', 'word_dim', 'input_var_dict', 'topk_size', 'sparse_emb', 'max_len', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=({}, 50, True, 100, 1, 1, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.BeamSearchDecoder.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '98d88fa1c989748410a12517c6a585bf')) +paddle.fluid.contrib.BeamSearchDecoder.decode (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '1e47c60f080c1343ebb6ceaef89656b2')) +paddle.fluid.contrib.BeamSearchDecoder.early_stop (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '3a84a7454ed6707f79b9e954d92a7575')) +paddle.fluid.contrib.BeamSearchDecoder.read_array (ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False)), ('document', 'aa89eb8fd5e4cabaf5cc1bcae14665a4')) +paddle.fluid.contrib.BeamSearchDecoder.update_array (ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None), ('document', '5754e9b3212b7c09497151516a0de5a7')) +paddle.fluid.contrib.memory_usage (ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None), ('document', '8fcb2f93bb743693baa8d4860a5ccc47')) +paddle.fluid.contrib.op_freq_statistic (ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None), ('document', '4d43687113c4bf5b29d15aee2f4e4afa')) +paddle.fluid.contrib.QuantizeTranspiler.__init__ (ArgSpec(args=['self', 'weight_bits', 'activation_bits', 'activation_quantize_type', 'weight_quantize_type', 'window_size'], varargs=None, keywords=None, defaults=(8, 8, 'abs_max', 'abs_max', 10000)), ('document', '14b39f1fcd5667ff556b1aad94357d1d')) +paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 (ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.QuantizeTranspiler.freeze_program (ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None)), ('document', '909675a1ab055c69b436a7893fcae4fd')) +paddle.fluid.contrib.QuantizeTranspiler.training_transpile (ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6dd9909f10b283ba2892a99058a72884')) +paddle.fluid.contrib.Calibrator.__init__ (ArgSpec(args=['self'], varargs='args', keywords='kwargs', defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.Calibrator.sample_data (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '3b8c85ca1e2cf753cc8c90a6c6992958')) +paddle.fluid.contrib.Calibrator.save_int8_model (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.reader.ctr_reader.ctr_reader (ArgSpec(args=['feed_dict', 'file_type', 'file_format', 'dense_slot_index', 'sparse_slot_index', 'capacity', 'thread_num', 'batch_size', 'file_list', 'slots', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b2ebf3de2a6ef1af2c3b88d2db7591ab')) +paddle.fluid.contrib.build_compressor (ArgSpec(args=['place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'config'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.CompressPass.__init__ (ArgSpec(args=['self', 'place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'program_exe'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.CompressPass.add_strategy (ArgSpec(args=['self', 'strategy'], varargs=None, keywords=None, defaults=None), ('document', '3bf6010b6f47d3c86df0ec8957be95e0')) +paddle.fluid.contrib.CompressPass.apply (ArgSpec(args=['self', 'graph'], varargs=None, keywords=None, defaults=None), ('document', 'a92bf85d4b59bd4f2ac1706d7c4899a6')) +paddle.fluid.contrib.ImitationGraph.__init__ (ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.ImitationGraph.all_parameters (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.SensitivePruneStrategy.__init__ (ArgSpec(args=['self', 'pruner', 'start_epoch', 'end_epoch', 'delta_rate', 'acc_loss_threshold', 'sensitivities'], varargs=None, keywords=None, defaults=(None, 0, 10, 0.2, 0.2, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.SensitivePruneStrategy.on_batch_begin (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.SensitivePruneStrategy.on_batch_end (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.SensitivePruneStrategy.on_compress_begin (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.SensitivePruneStrategy.on_compress_end (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_begin (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_end (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.MagnitudePruner.__init__ (ArgSpec(args=['self', 'threshold'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.MagnitudePruner.prune (ArgSpec(args=['self', 'param', 'threshold'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.RatioPruner.__init__ (ArgSpec(args=['self', 'ratios'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e7a81a325b296a9ca502ee5adb4fc85d')) +paddle.fluid.contrib.RatioPruner.prune (ArgSpec(args=['self', 'param', 'ratio'], varargs=None, keywords=None, defaults=(None,)), ('document', '358cbf2978c91028fb96a195a9884645')) +paddle.fluid.contrib.load_persistables_for_increment (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None), ('document', '11fbf7e8dd2289805de291b453a33ee7')) +paddle.fluid.contrib.load_persistables_for_inference (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var_name'], varargs=None, keywords=None, defaults=None), ('document', '5b5577bb3d24070da819674255d16196')) +paddle.fluid.contrib.convert_dist_to_sparse_program (ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None), ('document', '4efbd93876832d4d35497cdbc7a1e6d8')) +paddle.fluid.contrib.HDFSClient.__init__ (ArgSpec(args=['self', 'hadoop_home', 'configs'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.HDFSClient.delete (ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None), ('document', 'c3721aa2d4d9ef5a857dd47b2681c03e')) +paddle.fluid.contrib.HDFSClient.download (ArgSpec(args=['self', 'hdfs_path', 'local_path', 'overwrite', 'unzip'], varargs=None, keywords=None, defaults=(False, False)), ('document', 'ca55bde92184d3fd0f9f5c963b25e634')) +paddle.fluid.contrib.HDFSClient.is_dir (ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=(None,)), ('document', '45bde1bae02605a205c8245b58b9156d')) +paddle.fluid.contrib.HDFSClient.is_exist (ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=(None,)), ('document', 'be9c94bccff7ba0c1d95883ac62b5864')) +paddle.fluid.contrib.HDFSClient.ls (ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None), ('document', '808acac504870c7e46594b95674f8a86')) +paddle.fluid.contrib.HDFSClient.lsr (ArgSpec(args=['self', 'hdfs_path', 'only_file', 'sort'], varargs=None, keywords=None, defaults=(True, True)), ('document', 'fae835aa3354eb6a0434c0f9ba3c2747')) +paddle.fluid.contrib.HDFSClient.make_local_dirs (ArgSpec(args=['local_path'], varargs=None, keywords=None, defaults=None), ('document', 'e76b89c8e7f019b5da576c0026fcf689')) +paddle.fluid.contrib.HDFSClient.makedirs (ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None), ('document', '44d9972aae390aedf40aaea731a37e4b')) +paddle.fluid.contrib.HDFSClient.rename (ArgSpec(args=['self', 'hdfs_src_path', 'hdfs_dst_path', 'overwrite'], varargs=None, keywords=None, defaults=(False,)), ('document', '0eb133644d9a9f4da45bb39261ff0955')) +paddle.fluid.contrib.HDFSClient.upload (ArgSpec(args=['self', 'hdfs_path', 'local_path', 'overwrite', 'retry_times'], varargs=None, keywords=None, defaults=(False, 5)), ('document', '7d053b4bfd6dcfdd2c9dda0e0dbd9665')) +paddle.fluid.contrib.multi_download (ArgSpec(args=['client', 'hdfs_path', 'local_path', 'trainer_id', 'trainers', 'multi_processes'], varargs=None, keywords=None, defaults=(5,)), ('document', '100927be598ed8f9eaa1f3ef1b23568a')) +paddle.fluid.contrib.multi_upload (ArgSpec(args=['client', 'hdfs_path', 'local_path', 'multi_processes', 'overwrite', 'sync'], varargs=None, keywords=None, defaults=(5, False, True)), ('document', '183f34c83d30dbe16e09e8716c41958a')) +paddle.fluid.transpiler.DistributeTranspiler.__init__ (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', '292ab72977afbe58e6a3bde175452680')) +paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', '78f4949aedf317666a89ca74b3748ba8')) +paddle.fluid.transpiler.DistributeTranspiler.get_startup_program (ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'd796fc0c8d51503b556fcf6dc15c4f0c')) +paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program (ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)), ('document', '736330e31a7a54abccc0c7fd9119d9ff')) +paddle.fluid.transpiler.DistributeTranspiler.transpile (ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174')), ('document', '06ce55338dfe96311ad1078235ab3bf4')) +paddle.fluid.transpiler.memory_optimize (ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, False)), ('document', 'eda17d0f1639bc6ca215cecf87f588a4')) +paddle.fluid.transpiler.release_memory (ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ac4114d3df16264f1946deb3a8434a6f')) +paddle.fluid.transpiler.HashName.__init__ (ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.transpiler.HashName.dispatch (ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.transpiler.HashName.reset (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.transpiler.RoundRobin.__init__ (ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.transpiler.RoundRobin.dispatch (ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.transpiler.RoundRobin.reset (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.transpiler.DistributeTranspilerConfig.__init__ -paddle.fluid.nets.simple_img_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True)) -paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type', 'bias_attr'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max', None)) -paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,)) -paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0)) -paddle.fluid.nets.img_conv_group ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True)) -paddle.fluid.optimizer.SGDOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None)) -paddle.fluid.optimizer.SGDOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None) -paddle.fluid.optimizer.SGDOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) -paddle.fluid.optimizer.SGDOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) -paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None)) -paddle.fluid.optimizer.MomentumOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None) -paddle.fluid.optimizer.MomentumOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) -paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) -paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, None, None)) -paddle.fluid.optimizer.AdagradOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None) -paddle.fluid.optimizer.AdagradOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) -paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) -paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False)) -paddle.fluid.optimizer.AdamOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None) -paddle.fluid.optimizer.AdamOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) -paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) -paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None)) -paddle.fluid.optimizer.AdamaxOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None) -paddle.fluid.optimizer.AdamaxOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) -paddle.fluid.optimizer.AdamaxOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) -paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, None, None)) -paddle.fluid.optimizer.DecayedAdagradOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None) -paddle.fluid.optimizer.DecayedAdagradOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) -paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) -paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.0, 0.0, -0.5, None, None)) -paddle.fluid.optimizer.FtrlOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None) -paddle.fluid.optimizer.FtrlOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) -paddle.fluid.optimizer.FtrlOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) -paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, 0.0, False, None, None)) -paddle.fluid.optimizer.RMSPropOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None) -paddle.fluid.optimizer.RMSPropOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) -paddle.fluid.optimizer.RMSPropOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) -paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, 0.95, None, None)) -paddle.fluid.optimizer.AdadeltaOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None) -paddle.fluid.optimizer.AdadeltaOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) -paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) -paddle.fluid.optimizer.ModelAverage.__init__ ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window', 'regularization', 'name'], varargs=None, keywords=None, defaults=(10000, 10000, None, None)) -paddle.fluid.optimizer.ModelAverage.apply ArgSpec(args=['self', 'executor', 'need_restore'], varargs=None, keywords=None, defaults=(True,)) -paddle.fluid.optimizer.ModelAverage.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None) -paddle.fluid.optimizer.ModelAverage.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) -paddle.fluid.optimizer.ModelAverage.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) -paddle.fluid.optimizer.ModelAverage.restore ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None) -paddle.fluid.optimizer.LarsMomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'lars_coeff', 'lars_weight_decay', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.0005, None, None)) -paddle.fluid.optimizer.LarsMomentumOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None) -paddle.fluid.optimizer.LarsMomentumOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) -paddle.fluid.optimizer.LarsMomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) -paddle.fluid.backward.append_backward ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None)) -paddle.fluid.regularizer.L1DecayRegularizer.__init__ ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)) -paddle.fluid.regularizer.L2DecayRegularizer.__init__ ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)) +paddle.fluid.nets.simple_img_conv_pool (ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True)), ('document', 'e0f67f35abf27f666f81003113b90244')) +paddle.fluid.nets.sequence_conv_pool (ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type', 'bias_attr'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max', None)), ('document', '48c434dd7bb827f69d90e5135d77470f')) +paddle.fluid.nets.glu (ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,)), ('document', '08c1c57e1db6b20bf87b264cb7cf3ca8')) +paddle.fluid.nets.scaled_dot_product_attention (ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0)), ('document', '921714c9bfb351b41403418265393203')) +paddle.fluid.nets.img_conv_group (ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True)), ('document', '3802be78fbfb206dae64a2d9f8480970')) +paddle.fluid.optimizer.SGDOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.SGDOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) +paddle.fluid.optimizer.SGDOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.SGDOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) +paddle.fluid.optimizer.MomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.MomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) +paddle.fluid.optimizer.MomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.MomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) +paddle.fluid.optimizer.AdagradOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name', 'initial_accumulator_value'], varargs=None, keywords=None, defaults=(1e-06, None, None, 0.0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.AdagradOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) +paddle.fluid.optimizer.AdagradOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.AdagradOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) +paddle.fluid.optimizer.AdamOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.AdamOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) +paddle.fluid.optimizer.AdamOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.AdamOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) +paddle.fluid.optimizer.AdamaxOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.AdamaxOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) +paddle.fluid.optimizer.AdamaxOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.AdamaxOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) +paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.DecayedAdagradOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) +paddle.fluid.optimizer.DecayedAdagradOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) +paddle.fluid.optimizer.FtrlOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.0, 0.0, -0.5, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.FtrlOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) +paddle.fluid.optimizer.FtrlOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.FtrlOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) +paddle.fluid.optimizer.RMSPropOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, 0.0, False, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.RMSPropOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) +paddle.fluid.optimizer.RMSPropOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.RMSPropOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) +paddle.fluid.optimizer.AdadeltaOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, 0.95, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.AdadeltaOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) +paddle.fluid.optimizer.AdadeltaOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.AdadeltaOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) +paddle.fluid.optimizer.ModelAverage.__init__ (ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window', 'regularization', 'name'], varargs=None, keywords=None, defaults=(10000, 10000, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.ModelAverage.apply (ArgSpec(args=['self', 'executor', 'need_restore'], varargs=None, keywords=None, defaults=(True,)), ('document', '46234a5470590feb336346f70a3db715')) +paddle.fluid.optimizer.ModelAverage.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) +paddle.fluid.optimizer.ModelAverage.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.ModelAverage.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) +paddle.fluid.optimizer.ModelAverage.restore (ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None), ('document', '18db9c70be9c4dd466f9844457b21bfe')) +paddle.fluid.optimizer.LarsMomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'lars_coeff', 'lars_weight_decay', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.0005, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.LarsMomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) +paddle.fluid.optimizer.LarsMomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.LarsMomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) +paddle.fluid.backward.append_backward (ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '1a79bd7d10ae54ca763ec81bca36ba24')) +paddle.fluid.regularizer.L1DecayRegularizer.__init__ (ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.regularizer.L2DecayRegularizer.__init__ (ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.LoDTensor.__init__ 1. __init__(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None 2. __init__(self: paddle.fluid.core.LoDTensor) -> None paddle.fluid.LoDTensor.has_valid_recursive_sequence_lengths has_valid_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor) -> bool paddle.fluid.LoDTensor.lod lod(self: paddle.fluid.core.LoDTensor) -> List[List[int]] paddle.fluid.LoDTensor.recursive_sequence_lengths recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor) -> List[List[int]] paddle.fluid.LoDTensor.set 1. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CPUPlace) -> None 2. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CPUPlace) -> None 3. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CPUPlace) -> None 4. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CPUPlace) -> None 5. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CPUPlace) -> None 6. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CPUPlace) -> None 7. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CPUPlace) -> None 8. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CPUPlace) -> None 9. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPlace) -> None 10. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPlace) -> None 11. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPlace) -> None 12. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPlace) -> None 13. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPlace) -> None 14. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPlace) -> None 15. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPlace) -> None 16. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CUDAPlace) -> None 17. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPinnedPlace) -> None 18. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPinnedPlace) -> None 19. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPinnedPlace) -> None 20. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPinnedPlace) -> None 21. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPinnedPlace) -> None 22. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPinnedPlace) -> None 23. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPinnedPlace) -> None 24. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CUDAPinnedPlace) -> None -paddle.fluid.LoDTensor.set_lod set_lod(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None -paddle.fluid.LoDTensor.set_recursive_sequence_lengths set_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None +paddle.fluid.LoDTensor.set_lod set_lod(self: paddle.fluid.core.LoDTensor, lod: List[List[int]]) -> None +paddle.fluid.LoDTensor.set_recursive_sequence_lengths set_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor, recursive_sequence_lengths: List[List[int]]) -> None paddle.fluid.LoDTensor.shape shape(self: paddle.fluid.core.Tensor) -> List[int] paddle.fluid.LoDTensorArray.__init__ __init__(self: paddle.fluid.core.LoDTensorArray) -> None -paddle.fluid.LoDTensorArray.append append(self: paddle.fluid.core.LoDTensorArray, arg0: paddle.fluid.core.LoDTensor) -> None +paddle.fluid.LoDTensorArray.append append(self: paddle.fluid.core.LoDTensorArray, tensor: paddle.fluid.core.LoDTensor) -> None paddle.fluid.CPUPlace.__init__ __init__(self: paddle.fluid.core.CPUPlace) -> None paddle.fluid.CUDAPlace.__init__ __init__(self: paddle.fluid.core.CUDAPlace, arg0: int) -> None paddle.fluid.CUDAPinnedPlace.__init__ __init__(self: paddle.fluid.core.CUDAPinnedPlace) -> None -paddle.fluid.ParamAttr.__init__ ArgSpec(args=['self', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, 1.0, None, True, None, False)) -paddle.fluid.WeightNormParamAttr.__init__ ArgSpec(args=['self', 'dim', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, None, 1.0, None, True, None, False)) -paddle.fluid.DataFeeder.__init__ ArgSpec(args=['self', 'feed_list', 'place', 'program'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.DataFeeder.decorate_reader ArgSpec(args=['self', 'reader', 'multi_devices', 'num_places', 'drop_last'], varargs=None, keywords=None, defaults=(None, True)) -paddle.fluid.DataFeeder.feed ArgSpec(args=['self', 'iterable'], varargs=None, keywords=None, defaults=None) -paddle.fluid.DataFeeder.feed_parallel ArgSpec(args=['self', 'iterable', 'num_places'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.clip.ErrorClipByValue.__init__ ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.clip.GradientClipByValue.__init__ ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.clip.GradientClipByNorm.__init__ ArgSpec(args=['self', 'clip_norm'], varargs=None, keywords=None, defaults=None) -paddle.fluid.clip.GradientClipByGlobalNorm.__init__ ArgSpec(args=['self', 'clip_norm', 'group_name'], varargs=None, keywords=None, defaults=('default_group',)) -paddle.fluid.profiler.cuda_profiler ArgSpec(args=['output_file', 'output_mode', 'config'], varargs=None, keywords=None, defaults=(None, None)) -paddle.fluid.profiler.reset_profiler ArgSpec(args=[], varargs=None, keywords=None, defaults=None) -paddle.fluid.profiler.profiler ArgSpec(args=['state', 'sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')) -paddle.fluid.profiler.start_profiler ArgSpec(args=['state'], varargs=None, keywords=None, defaults=None) -paddle.fluid.profiler.stop_profiler ArgSpec(args=['sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')) -paddle.fluid.unique_name.generate ArgSpec(args=['key'], varargs=None, keywords=None, defaults=None) -paddle.fluid.unique_name.switch ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.unique_name.guard ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.recordio_writer.convert_reader_to_recordio_file ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)) -paddle.fluid.recordio_writer.convert_reader_to_recordio_files ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)) +paddle.fluid.ParamAttr.__init__ (ArgSpec(args=['self', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, 1.0, None, True, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.WeightNormParamAttr.__init__ (ArgSpec(args=['self', 'dim', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, None, 1.0, None, True, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.DataFeeder.__init__ (ArgSpec(args=['self', 'feed_list', 'place', 'program'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.DataFeeder.decorate_reader (ArgSpec(args=['self', 'reader', 'multi_devices', 'num_places', 'drop_last'], varargs=None, keywords=None, defaults=(None, True)), ('document', '0eed2f198dc73c08a41b61edbc755753')) +paddle.fluid.DataFeeder.feed (ArgSpec(args=['self', 'iterable'], varargs=None, keywords=None, defaults=None), ('document', '459e316301279dfd82001b46f0b8ffca')) +paddle.fluid.DataFeeder.feed_parallel (ArgSpec(args=['self', 'iterable', 'num_places'], varargs=None, keywords=None, defaults=(None,)), ('document', '543863d1f9d4853758adb613b8659e85')) +paddle.fluid.clip.ErrorClipByValue.__init__ (ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.clip.GradientClipByValue.__init__ (ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.clip.GradientClipByNorm.__init__ (ArgSpec(args=['self', 'clip_norm'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.clip.GradientClipByGlobalNorm.__init__ (ArgSpec(args=['self', 'clip_norm', 'group_name'], varargs=None, keywords=None, defaults=('default_group',)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.profiler.cuda_profiler (ArgSpec(args=['output_file', 'output_mode', 'config'], varargs=None, keywords=None, defaults=(None, None)), ('document', '2e2fb1cfc469a67f19fb578a2ed6be79')) +paddle.fluid.profiler.reset_profiler (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '397ce757fabbe5c622e0c3458c41fcd0')) +paddle.fluid.profiler.profiler (ArgSpec(args=['state', 'sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')), ('document', 'bd3a07eeb68e384f4d2d416cb2e28d86')) +paddle.fluid.profiler.start_profiler (ArgSpec(args=['state'], varargs=None, keywords=None, defaults=None), ('document', '88da8fb6dbebaee2f7520188a09574f9')) +paddle.fluid.profiler.stop_profiler (ArgSpec(args=['sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')), ('document', 'a7500e39dd033f1e64f562e909333a8a')) +paddle.fluid.unique_name.generate (ArgSpec(args=['key'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.unique_name.switch (ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.unique_name.guard (ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.recordio_writer.convert_reader_to_recordio_file (ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)), ('document', '65c7523e86f0c50bb729b01667f36310')) +paddle.fluid.recordio_writer.convert_reader_to_recordio_files (ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)), ('document', 'bc643f0f5f1b9db57ff0d8a57d379bd7')) paddle.fluid.Scope Scope() -> paddle.fluid.core._Scope -paddle.reader.map_readers ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None) -paddle.reader.buffered ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None) -paddle.reader.compose ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None) -paddle.reader.chain ArgSpec(args=[], varargs='readers', keywords=None, defaults=None) -paddle.reader.shuffle ArgSpec(args=['reader', 'buf_size'], varargs=None, keywords=None, defaults=None) -paddle.reader.firstn ArgSpec(args=['reader', 'n'], varargs=None, keywords=None, defaults=None) -paddle.reader.xmap_readers ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,)) -paddle.reader.PipeReader.__init__ ArgSpec(args=['self', 'command', 'bufsize', 'file_type'], varargs=None, keywords=None, defaults=(8192, 'plain')) -paddle.reader.PipeReader.get_line ArgSpec(args=['self', 'cut_lines', 'line_break'], varargs=None, keywords=None, defaults=(True, '\n')) -paddle.reader.multiprocess_reader ArgSpec(args=['readers', 'use_pipe', 'queue_size'], varargs=None, keywords=None, defaults=(True, 1000)) -paddle.reader.Fake.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.reader.creator.np_array ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None) -paddle.reader.creator.text_file ArgSpec(args=['path'], varargs=None, keywords=None, defaults=None) -paddle.reader.creator.recordio ArgSpec(args=['paths', 'buf_size'], varargs=None, keywords=None, defaults=(100,)) +paddle.reader.map_readers (ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None), ('document', '77cbadb09df588e21e5cc0819b69c87d')) +paddle.reader.buffered (ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None), ('document', '0d6186f109feceb99f60ec50a0a624cb')) +paddle.reader.compose (ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None), ('document', '884291104e1c3f37f33aae44b7deeb0d')) +paddle.reader.chain (ArgSpec(args=[], varargs='readers', keywords=None, defaults=None), ('document', 'd22c34e379a53901ae67a6bca7f4def4')) +paddle.reader.shuffle (ArgSpec(args=['reader', 'buf_size'], varargs=None, keywords=None, defaults=None), ('document', 'e42ea6fee23ce26b23cb142cd1d6522d')) +paddle.reader.firstn (ArgSpec(args=['reader', 'n'], varargs=None, keywords=None, defaults=None), ('document', 'c5bb8f7dd4f917f1569a368aab5b8aad')) +paddle.reader.xmap_readers (ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,)), ('document', '283bc0b8a0e26ae186b8b9bee4aec560')) +paddle.reader.PipeReader.__init__ (ArgSpec(args=['self', 'command', 'bufsize', 'file_type'], varargs=None, keywords=None, defaults=(8192, 'plain')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.reader.PipeReader.get_line (ArgSpec(args=['self', 'cut_lines', 'line_break'], varargs=None, keywords=None, defaults=(True, '\n')), ('document', '5f80a7ed70052f01665e4c74acccfa69')) +paddle.reader.multiprocess_reader (ArgSpec(args=['readers', 'use_pipe', 'queue_size'], varargs=None, keywords=None, defaults=(True, 1000)), ('document', '7d8b3a96e592107c893d5d51ce968ba0')) +paddle.reader.Fake.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.reader.creator.np_array (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '28d457fbc9a71efa4ac91a3be179cada')) +paddle.reader.creator.text_file (ArgSpec(args=['path'], varargs=None, keywords=None, defaults=None), ('document', '44fe286ab6175a5464d3a961a68c266a')) +paddle.reader.creator.recordio (ArgSpec(args=['paths', 'buf_size'], varargs=None, keywords=None, defaults=(100,)), ('document', '11b3704ea42cfd537953387a7e58dae8')) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 7ddf1ab44fe096739f4d241994e5cb686970a7c5..ad19d729ebde4a9c81c283518f3cb2ac28152443 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -38,10 +38,10 @@ if(WITH_GPU) nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context) add_dependencies(tensor tensor_util) else() - nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context ) + nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context profiler) endif(WIN32) else() - cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context ) + cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context profiler) endif() cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) @@ -174,7 +174,7 @@ else() cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) endif() -target_link_libraries(executor garbage_collector) +target_link_libraries(executor garbage_collector while_op_helper) cc_library(parallel_executor SRCS parallel_executor.cc DEPS threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor diff --git a/paddle/fluid/framework/block_desc.cc b/paddle/fluid/framework/block_desc.cc index f537e4b9e569dd4c513ac0efde7240833bcf04b6..0b7aaf11746d1931e10ad7e5368d9e053092500e 100644 --- a/paddle/fluid/framework/block_desc.cc +++ b/paddle/fluid/framework/block_desc.cc @@ -13,7 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/block_desc.h" + #include +#include +#include + #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" @@ -155,6 +159,16 @@ void BlockDesc::RemoveOp(size_t s, size_t e) { ops_.erase(ops_.begin() + s, ops_.begin() + e); } +void BlockDesc::RemoveOpInternal(const OpDesc *op_desc) { + // TODO(minqiyang): make this faster + for (auto it = ops_.begin(); it != ops_.end(); ++it) { + if (it->get() == op_desc) { + ops_.erase(it); + break; + } + } +} + std::vector BlockDesc::AllOps() const { std::vector res; for (const auto &op : ops_) { diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h index 960ca39e1eadd3c064beb0e2c1342a406c4f0b6a..5c6e421516269a9b9865605400efa772f944a96f 100644 --- a/paddle/fluid/framework/block_desc.h +++ b/paddle/fluid/framework/block_desc.h @@ -93,6 +93,8 @@ class BlockDesc { */ void RemoveOp(size_t s, size_t e); + void RemoveOpInternal(const OpDesc *op_desc); + void RemoveVar(const std::string &name) { vars_.erase(name); } std::vector AllOps() const; diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index 72c50518af08b9c1b2f97e6864e5836e806c77fc..10aa7a59422f4508dda8d0bcd960583056e25938 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -134,11 +134,6 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, out_layout = out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout; - auto& pool = platform::DeviceContextPool::Instance(); - auto* dev_ctx = dynamic_cast( - pool.Get(expected_kernel_type.place_)); - auto& cpu_engine = dev_ctx->GetEngine(); - std::vector in_tz = paddle::framework::vectorize2int(in.dims()); std::vector out_tz = in_tz; @@ -147,29 +142,25 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, "Input tensor type is not supported: %s", in.type()); memory::data_type out_type = in_type; - auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format()); - auto out_format = - platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout)); - // output tensor has the same dims as input. Reorder don't change dims out->Resize(in.dims()); - if (in_format != out_format) { + // tempory mem pd fr out , to make reorder + auto out_mem_pd = paddle::platform::create_prim_desc_from_dims( + paddle::framework::vectorize2int(out->dims()), + mkldnn::memory::format::blocked, out_type); + if (in.get_mkldnn_prim_desc() != out_mem_pd) { void* in_data = GetDataFromTensor(in, in_type); auto out_data = out->mutable_data(expected_kernel_type.place_, in.type()); - auto in_memory = - memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data); - auto out_memory = - memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data); + auto in_memory = memory(in.get_mkldnn_prim_desc(), in_data); + auto out_memory = memory(out_mem_pd, out_data); platform::Reorder(in_memory, out_memory); } else { out->ShareDataWith(in); } out->set_layout(out_layout); - // reset format since the out tensor will be feed to non-MKLDNN OPkernel - out->set_format(memory::format::format_undef); #endif } diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc index 82872224501709080ff02a13464d58543a0abda8..f0203edf05635452bf347335066dadc24ecc3138 100644 --- a/paddle/fluid/framework/data_transform.cc +++ b/paddle/fluid/framework/data_transform.cc @@ -51,13 +51,31 @@ void TransformData(const OpKernelType &expected_kernel_type, #ifdef PADDLE_WITH_MKLDNN // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel // Just set layout/format. No real transform occur - - auto out_format = platform::MKLDNNFormatForSize(in.dims().size(), - ToMKLDNNFormat(lin)); - out.ShareDataWith(input_tensor); - out.set_layout(DataLayout::kMKLDNN); - out.set_format(out_format); + // TODO(jczaja): Remove that once all mkldnn ops + // are modified to work with mkldnn_blocked + auto mkldnn_fmt = [&](int rank) { + switch (rank) { + case 5: + return mkldnn::memory::format::ncdhw; + case 4: + return mkldnn::memory::format::nchw; + case 3: + return mkldnn::memory::format::ncw; + case 2: + return mkldnn::memory::format::nc; + case 1: + return mkldnn::memory::format::x; + default: + return mkldnn::memory::format::blocked; + } + }; + + auto out_mem_pd = paddle::platform::create_prim_desc_from_dims( + paddle::framework::vectorize2int(out.dims()), + mkldnn_fmt(out.dims().size())); + + out.set_mkldnn_prim_desc(out_mem_pd); #endif } else { // Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index e88084424baf7eb5cd1d67ea19966866d71ec3eb..9f06455ea5410bcab081ed212a34960f8fe6f0bf 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -50,13 +50,19 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_ cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope) -cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper) +if(WITH_GPU) +cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper gpu_info) +else() +cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper cpu_info) +endif() + cc_library(memory_optimize_pass SRCS memory_optimize_pass.cc DEPS memory_optimize_helper pass) cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_info) cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper) cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle) cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper) -cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass) +cc_library(while_op_eager_deletion_pass SRCS while_op_eager_deletion_pass.cc DEPS while_op_helper graph_helper pass computation_op_handle) +cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass while_op_eager_deletion_pass) cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view reference_count_pass_helper) cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass) diff --git a/paddle/fluid/framework/details/all_reduce_deps_pass.cc b/paddle/fluid/framework/details/all_reduce_deps_pass.cc index b7d6edd389d8e40835dadf56d7c54d53402f6f4d..ff223e616f7ef0c794e72a0028c7e5bb3f234ec0 100644 --- a/paddle/fluid/framework/details/all_reduce_deps_pass.cc +++ b/paddle/fluid/framework/details/all_reduce_deps_pass.cc @@ -30,8 +30,6 @@ namespace paddle { namespace framework { namespace details { -static constexpr char kAllOpDescs[] = "all_op_descs"; - VarHandle* GetValidInput(const OpHandleBase* a) { for (auto p : a->Inputs()) { VarHandle* b = dynamic_cast(p); @@ -52,7 +50,7 @@ std::unique_ptr AllReduceDepsPass::ApplyImpl( std::unordered_map vars; // TODO(gongwb): use graph topology sort to find the order of operators. // Note that must assert topology sort is stable - auto& ops = Get>(kAllOpDescs); + auto& ops = graph->Get>(kStaleProgramOpDescs); for (auto* op_desc : ops) { auto outputs = op_desc->Outputs(); for (auto& o_it : outputs) { @@ -122,4 +120,4 @@ std::unique_ptr AllReduceDepsPass::ApplyImpl( REGISTER_PASS(all_reduce_deps_pass, paddle::framework::details::AllReduceDepsPass) - .RequirePassAttr(paddle::framework::details::kAllOpDescs); + .RequireGraphAttr(paddle::framework::details::kStaleProgramOpDescs); diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index dd77f7099f581a5b825916c4ea010023f3ad5bcd..c1f9c2b60c915370df7793f26fe83812a7ced96d 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -53,7 +53,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, #endif void AllReduceOpHandle::RunImpl() { - platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second); + platform::RecordEvent record_event(Name()); WaitInputVarGenerated(); auto in_var_handles = DynamicCast(this->Inputs()); diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index 89d626edddfee3d2c43a3cf2232ad4fc1611e655..fdff83b92819b39974f3b2ce0848710f1ee02a41 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -22,7 +22,7 @@ namespace framework { namespace details { void BroadcastOpHandle::RunImpl() { - platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second); + platform::RecordEvent record_event(Name()); if (places_.size() == 1) return; @@ -30,7 +30,7 @@ void BroadcastOpHandle::RunImpl() { VarHandle *in_var_handle; { auto in_var_handles = DynamicCast(inputs_); - PADDLE_ENFORCE_EQ(in_var_handles.size(), 1, + PADDLE_ENFORCE_EQ(in_var_handles.size(), 1UL, "The number of input should be one."); in_var_handle = in_var_handles[0]; } diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index f8030c53f72bc8a6f007c1eb6a3072abd8037de2..2cfc76e47f41862731fb2de5d1d03287acd4d9d7 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -34,9 +34,11 @@ namespace details { static inline bool SeqOnlyAllReduceOps(const BuildStrategy &strategy) { // Should fix the allreduce op order if scheduling // them in multiple threads or processes to avoid hang. + // NOTE: ParallelGraph would execute this pass on each graph, so + // don't need to append it here. return (!strategy.enable_sequential_execution_ && - strategy.num_trainers_ > 1) || - strategy.enable_parallel_graph_; + strategy.num_trainers_ > 1) && + !strategy.enable_parallel_graph_; } class ParallelExecutorPassBuilder : public ir::PassBuilder { @@ -133,12 +135,15 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { void AppendMultiDevPass(const BuildStrategy &strategy) { ir::Pass *multi_devices_pass; if (strategy_.is_distribution_) { + VLOG(3) << "multi device parameter server mode"; multi_devices_pass = AppendPass("dist_multi_devices_pass").get(); } else { if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { + VLOG(3) << "multi devices collective mode with allreduce"; multi_devices_pass = AppendPass("allreduce_mode_multi_devices_pass").get(); } else if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kReduce) { + VLOG(3) << "multi deivces collective mode with reduce"; multi_devices_pass = AppendPass("reduce_mode_multi_devices_pass").get(); } else { PADDLE_THROW("Unknown reduce strategy."); @@ -169,7 +174,8 @@ bool BuildStrategy::IsMultiDevPass(const std::string &pass_name) const { } std::unique_ptr BuildStrategy::Apply( - const ProgramDesc &main_program, const std::vector &places, + std::unique_ptr graph, + const std::vector &places, const std::string &loss_var_name, const std::vector &local_scopes, const size_t &nranks, #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) @@ -180,7 +186,6 @@ std::unique_ptr BuildStrategy::Apply( // Create a default one if not finalized by user. CreatePassesFromStrategy(false); - std::unique_ptr graph(new ir::Graph(main_program)); for (std::shared_ptr &pass : pass_builder_->AllPasses()) { if (IsMultiDevPass(pass->Type())) { pass->Erase(kPlaces); @@ -198,41 +203,12 @@ std::unique_ptr BuildStrategy::Apply( pass->Erase("nccl_ctxs"); pass->SetNotOwned("nccl_ctxs", nctx); #endif - } else if (pass->Type() == "memory_optimize_pass") { - if (graph->Has(kAllOpDescs)) { - graph->Erase(kAllOpDescs); - } - const std::vector *all_op_descs = - new std::vector(main_program.Block(0).AllOps()); - graph->Set>(kAllOpDescs, - all_op_descs); // take ownership - - pass->Erase(kAllOpDescs); - pass->SetNotOwned>(kAllOpDescs, all_op_descs); - } else if (pass->Type() == "sequential_execution_pass") { LOG(INFO) << "set enable_sequential_execution:" << enable_sequential_execution_; - - pass->Erase(kAllOpDescs); - pass->Set>( - kAllOpDescs, - new std::vector(main_program.Block(0).AllOps())); } else if (pass->Type() == "all_reduce_deps_pass") { LOG(INFO) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this) << ", num_trainers:" << num_trainers_; - - pass->Erase(kAllOpDescs); - pass->Set>( - kAllOpDescs, - new std::vector(main_program.Block(0).AllOps())); - } else if (pass->Type() == "inplace_pass") { - if (graph->Has(kAllOpDescs)) { - graph->Erase(kAllOpDescs); - } - graph->Set>( - kAllOpDescs, - new std::vector(main_program.Block(0).AllOps())); } else if (pass->Type() == "fuse_relu_depthwise_conv_pass") { if (!use_cuda) { LOG(WARNING) << "fuse_relu_depthwise_conv_pass is only supported on " @@ -240,7 +216,9 @@ std::unique_ptr BuildStrategy::Apply( continue; } } + VLOG(3) << "Start Apply Pass " << pass->Type(); graph = pass->Apply(std::move(graph)); + VLOG(3) << "Finish Apply Pass " << pass->Type(); } return graph; } diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index e62e3edcef710df739c53b5d848f5aceb4f2db4e..d755a2505aead37538bef2b01a193dba87dc1567 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include @@ -76,11 +77,11 @@ struct BuildStrategy { bool fuse_relu_depthwise_conv_{false}; - bool memory_optimize_{false}; + bool memory_optimize_{true}; // TODO(dzhwinter): // make enable_inplace, memory_optimize_ // memory_early_delete_ true by default - bool enable_inplace_{false}; + bool enable_inplace_{true}; bool enable_sequential_execution_{false}; @@ -114,7 +115,7 @@ struct BuildStrategy { // Apply the passes built by the pass_builder_. The passes will be // applied to the Program and output an ir::Graph. - std::unique_ptr Apply(const ProgramDesc &main_program, + std::unique_ptr Apply(std::unique_ptr graph, const std::vector &places, const std::string &loss_var_name, const std::vector &local_scopes, diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h index 1e3dbb1e44ecb16872e3bf4dee31e31cc69c9818..e98b16e6b3a07bfa0994295306e3bfa9e4174834 100644 --- a/paddle/fluid/framework/details/computation_op_handle.h +++ b/paddle/fluid/framework/details/computation_op_handle.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include @@ -31,6 +32,8 @@ class ComputationOpHandle : public OpHandleBase { ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place, size_t scope_idx); + OperatorBase *GetOp() { return op_.get(); } + std::string Name() const override; const Scope *GetScope() const { return scope_; } diff --git a/paddle/fluid/framework/details/data_balance_op_handle.cc b/paddle/fluid/framework/details/data_balance_op_handle.cc index 48dcc52623369f7b0f51cd8c8aeb198b37467d5f..c9b52b68205ade000e21a3d06b80af86cbe01f34 100644 --- a/paddle/fluid/framework/details/data_balance_op_handle.cc +++ b/paddle/fluid/framework/details/data_balance_op_handle.cc @@ -86,7 +86,7 @@ std::vector> DataBalanceOpHandle::GetBalancePlan( } void DataBalanceOpHandle::RunImpl() { - PADDLE_ENFORCE_GT(places_.size(), 1, + PADDLE_ENFORCE_GT(places_.size(), 1UL, "Data balance can only be enabled when the number of " "places to run larger than 1."); auto in_var_handles = DynamicCast(this->Inputs()); diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index 03fbfd7f24a8a987db72f45be777acc7ece577a6..dbc90737f2286db6e74d3271f39d004c25e4a949 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -12,6 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include +#include +#include + #include "paddle/fluid/framework/details/eager_deletion_op_handle.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/scope.h" @@ -45,6 +49,7 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( } } #endif + PADDLE_ENFORCE(!var_names_.empty(), "Var names cannot be empty"); } EagerDeletionOpHandle::~EagerDeletionOpHandle() { @@ -60,15 +65,20 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() { std::string EagerDeletionOpHandle::Name() const { return "eager_deletion"; } void EagerDeletionOpHandle::RunImpl() { - auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get(); + Scope *exec_scope = nullptr; std::deque> garbages; for (auto &name : var_names_) { auto it = ref_cnts_->find(name); - // Var not found, not reference count has not decreased to 0 + // Reference count has not decreased to 0 if (it == ref_cnts_->end() || it->second.fetch_sub(1) != 1) { continue; } + if (!exec_scope) { + exec_scope = scope_->FindVar(kLocalExecScopeName)->Get(); + } + + // Var not found auto *var = exec_scope->FindVar(name); if (var == nullptr) { continue; diff --git a/paddle/fluid/framework/details/eager_deletion_pass.cc b/paddle/fluid/framework/details/eager_deletion_pass.cc index 4e42d0b4972d567dd769cad6ff8b9d45380ab77a..377bb915e0ce175d4e3fb74cb1ace21e5f46d9d8 100644 --- a/paddle/fluid/framework/details/eager_deletion_pass.cc +++ b/paddle/fluid/framework/details/eager_deletion_pass.cc @@ -12,20 +12,173 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include +#include #include #include +#include #include #include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/eager_deletion_op_handle.h" -#include "paddle/fluid/framework/details/eager_deletion_pass.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h" +DEFINE_double(memory_fraction_of_eager_deletion, 1.0, + "Fraction of eager deletion. If less than 1.0, all variables in " + "the program would be sorted according to its memory size, and " + "only the FLAGS_memory_fraction_of_eager_deletion of the largest " + "variables would be deleted."); + namespace paddle { namespace framework { namespace details { +// op -> variables which can be deleted after op runs +using OpToVarNameSetMap = + std::unordered_map>; + +// Check whether the variable is LoDTensor based on static VarDesc info +static bool IsLoDTensor(VarDesc *var) { + return var->Proto()->type().type() == proto::VarType::LOD_TENSOR; +} + +// Get memory size of LoDTensor +static int64_t GetMemorySize( + const std::unordered_map> &vars, + const std::string &var_name) { + auto *var_desc = TryGetLatestVarDesc(vars.at(var_name)); + PADDLE_ENFORCE_NOT_NULL(var_desc); + PADDLE_ENFORCE(IsLoDTensor(var_desc)); + auto dims = var_desc->GetShape(); + return SizeOfType(var_desc->GetDataType()) * + std::accumulate(dims.begin(), dims.end(), static_cast(1), + std::multiplies()); +} + +// Split all variables in the graph into LoDTensor and Non-LoDTensor (e.g. +// SelectedRows, LoDTensorArray) +// Since partial GC is based on static analysis of memory size of each variable +// So we should skip SelectedRows and LoDTensorArray here +static void SplitIntoLoDTensorAndNonLoDTensorVars( + const OpToVarNameSetMap &m, const GraphVars &vars, + OpToVarNameSetMap *lod_tensors, OpToVarNameSetMap *other_vars) { + lod_tensors->clear(); + other_vars->clear(); + + for (auto &op_vars_pair : m) { + for (auto &var_name : op_vars_pair.second) { + auto *var_desc = TryGetLatestVarDesc( + vars[op_vars_pair.first->GetScopeIdx()].at(var_name)); + if (IsLoDTensor(var_desc)) { + (*lod_tensors)[op_vars_pair.first].insert(var_name); + } else { + (*other_vars)[op_vars_pair.first].insert(var_name); + } + } + } +} + +struct GCVarInfo { + GCVarInfo(const std::string &name, int64_t memory_size, + ComputationOpHandle *op, size_t scope_idx) + : name_(name), + memory_size_(memory_size), + op_(op), + scope_idx_(scope_idx) {} + + std::string name_; // variable name + int64_t memory_size_; // memory size + ComputationOpHandle *op_; // op after which the variable could be deleted + size_t scope_idx_; // scope index where the variable locates + + int64_t AbsMemorySize() const { return std::abs(memory_size_); } +}; + +// Delete delete_lod_tensor_only is not used currently +static OpToVarNameSetMap ShrinkGCVars( + const OpToVarNameSetMap &m, const GraphVars &vars, + const std::vector &places, double fraction_of_memory_size, + bool delete_lod_tensor_only = false) { + // Do not perform gc when fraction_of_memory_size = 0 + if (fraction_of_memory_size <= 0.0) return {}; + + /** + * Step 1: Split all variables into LoDTensor and Non-LoDTensor. + * We can only calculate memory size of LoDTensors + */ + OpToVarNameSetMap lod_tensors, other_vars; + SplitIntoLoDTensorAndNonLoDTensorVars(m, vars, &lod_tensors, &other_vars); + + // Perform complete gc when fraction_of_memory_size >= 1 + if (fraction_of_memory_size >= 1.0) { + return delete_lod_tensor_only ? lod_tensors : m; + } + + /** + * Step 2: build GCVarInfos, and calculate total memory sizes of each device + */ + + // place -> variable info (name, memory size, place, scope_idx) + std::map> place_to_vars; + + // place -> total memory sizes + std::map place_to_size; + for (auto &op_vars_pair : lod_tensors) { + auto *op = op_vars_pair.first; + auto &var_names = op_vars_pair.second; + auto scope_idx = op->GetScopeIdx(); + auto &place = places[scope_idx]; + + for (auto &var_name : var_names) { + auto var_size = GetMemorySize(vars[scope_idx], var_name); + GCVarInfo var_info(var_name, var_size, op, scope_idx); + place_to_size[place] += var_info.AbsMemorySize(); + place_to_vars[place].emplace_back(std::move(var_info)); + } + } + + /** + * Step 3: sort GCVarInfos, and only delete the largest variables. + */ + OpToVarNameSetMap partial_vars; + for (auto &place_to_var_pair : place_to_vars) { + auto &place = place_to_var_pair.first; + auto &gc_vars = place_to_var_pair.second; + std::sort(gc_vars.begin(), gc_vars.end(), + [](const GCVarInfo &var1, const GCVarInfo &var2) { + return var1.AbsMemorySize() > var2.AbsMemorySize(); + }); + + int64_t accumulated_size = 0; + int64_t size_threshold = + static_cast(fraction_of_memory_size * place_to_size[place]); + for (size_t i = 0; i < gc_vars.size() && accumulated_size < size_threshold; + ++i) { + partial_vars[gc_vars[i].op_].insert(gc_vars[i].name_); + accumulated_size += gc_vars[i].AbsMemorySize(); + } + } + + /** + * Step 4: Combine other vars (SelectedRows, LoDTensorArray) + */ + if (!delete_lod_tensor_only) { + for (auto &op_vars_pair : other_vars) { + partial_vars[op_vars_pair.first].insert(op_vars_pair.second.begin(), + op_vars_pair.second.end()); + } + } + + return partial_vars; +} + +class EagerDeletionPass : public ir::Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; +}; + std::unique_ptr EagerDeletionPass::ApplyImpl( std::unique_ptr graph) const { auto &ref_cnts = @@ -43,9 +196,7 @@ std::unique_ptr EagerDeletionPass::ApplyImpl( // a reverse map of last_live_ops // i.e., last op --> variable names which can be deleted. - std::unordered_map> - op_vars_map; - + OpToVarNameSetMap op_vars_map; for (auto &var_ops_map : last_live_ops) { for (auto &var_ops_pair : var_ops_map) { const std::string &var_name = var_ops_pair.first; @@ -55,6 +206,9 @@ std::unique_ptr EagerDeletionPass::ApplyImpl( } } + op_vars_map = ShrinkGCVars(op_vars_map, vars, places, + FLAGS_memory_fraction_of_eager_deletion); + for (auto &pair : op_vars_map) { auto *op = pair.first; auto &var_names = pair.second; @@ -85,8 +239,13 @@ std::unique_ptr EagerDeletionPass::ApplyImpl( eager_deletion_op->AddOutput(dummy_leaf); } + VLOG(10) << "FLAGS_memory_fraction_of_eager_deletion = " + << FLAGS_memory_fraction_of_eager_deletion; VLOG(10) << "Create " << op_vars_map.size() << " EagerDeletionOpHandle(s)"; - return graph; + + auto while_op_eager_deletion_pass = + ir::PassRegistry::Instance().Get("while_op_eager_deletion_pass"); + return while_op_eager_deletion_pass->Apply(std::move(graph)); } } // namespace details @@ -99,3 +258,5 @@ REGISTER_PASS(eager_deletion_pass, .RequirePassAttr(paddle::framework::details::kLastLiveOpsOfVars) .RequirePassAttr(paddle::framework::details::kAllPlaces) .RequirePassAttr(paddle::framework::details::kGarbageCollector); + +USE_PASS(while_op_eager_deletion_pass); diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index 872bc5d654cd66db821e56031d878815b653645c..d4fbea9d95118666ababde811867e95c657c07de 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -12,7 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" +#include #include +#include #include #include "paddle/fluid/framework/details/fetch_op_handle.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" @@ -24,12 +26,11 @@ namespace details { FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor( const ExecutionStrategy &strategy, const std::vector &local_scopes, - const std::vector &places, - std::unique_ptr &&graph) + const std::vector &places, ir::Graph *graph) : strategy_(strategy), local_scopes_(local_scopes), places_(places), - graph_(std::move(graph)), + graph_(graph), pool_(strategy.num_threads_), prepare_pool_(1), // add one more thread for generate op_deps fetch_ctxs_(places) { @@ -56,7 +57,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run( std::vector fetch_ops; for (auto &fetch_var_name : fetch_tensors) { - for (auto &var_map : graph_->Get("vars")) { + for (auto &var_map : graph_->Get(details::kGraphVars)) { auto it = var_map.find(fetch_var_name); if (it != var_map.end()) { fetched_vars[fetch_var_name].push_back(*it->second.rbegin()); @@ -110,14 +111,14 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run( } } if (exception_.IsCaught()) { - ClearFetchOp(graph_.get(), &fetch_ops); + ClearFetchOp(graph_, &fetch_ops); exception_.ReThrow(); } } num_complete += num_comp; } // Wait FetchOps. - ClearFetchOp(graph_.get(), &fetch_ops); + ClearFetchOp(graph_, &fetch_ops); return fetches; } diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h index c3a8b85423403992e3a12ceb0a1acbae82d25dfa..970298950cc8089bc5861fcbf8dc2544934b181f 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h @@ -32,7 +32,7 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor { FastThreadedSSAGraphExecutor(const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, - std::unique_ptr &&graph); + ir::Graph *graph); FeedFetchList Run(const std::vector &fetch_tensors) override; const ir::Graph &Graph() const override; @@ -40,7 +40,7 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor { ExecutionStrategy strategy_; std::vector local_scopes_; std::vector places_; - std::unique_ptr graph_; + ir::Graph *graph_; std::unordered_map op_deps_; std::vector bootstrap_ops_; diff --git a/paddle/fluid/framework/details/fuse_vars_op_handle.cc b/paddle/fluid/framework/details/fuse_vars_op_handle.cc index d65b0920698748e8a2ded728d78fbcd69b7bae0e..14292c0a5d06aa3ff12b46b5768b136fa925752d 100644 --- a/paddle/fluid/framework/details/fuse_vars_op_handle.cc +++ b/paddle/fluid/framework/details/fuse_vars_op_handle.cc @@ -23,7 +23,7 @@ void FuseVarsOpHandle::RunImpl() { auto in_var_handles = DynamicCast(this->Inputs()); auto out_var_handles = DynamicCast(this->Outputs()); - PADDLE_ENFORCE_EQ(in_var_handles.size(), 0); + PADDLE_ENFORCE_EQ(in_var_handles.size(), 0UL); PADDLE_ENFORCE_EQ(out_var_handles.size() - 1, inputs_numel_.size(), ""); auto scope = local_scope_->FindVar(kLocalExecScopeName)->Get(); diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle.cc index 51dfa2d0711f49aaefab0af3549283dbf77eee4a..f48561ea32e6a3bbc7e9f2a8326b080ad21c6d61 100644 --- a/paddle/fluid/framework/details/fused_broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.cc @@ -22,7 +22,7 @@ namespace framework { namespace details { void FusedBroadcastOpHandle::RunImpl() { - platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second); + platform::RecordEvent record_event(Name()); if (places_.size() == 1UL) return; diff --git a/paddle/fluid/framework/details/inplace_op_pass.cc b/paddle/fluid/framework/details/inplace_op_pass.cc index b0c5968499be3a959dd6103424f25056a6dc2282..8d4717ad19d4ca0525eac4d1a0dfe6d0076a8c09 100644 --- a/paddle/fluid/framework/details/inplace_op_pass.cc +++ b/paddle/fluid/framework/details/inplace_op_pass.cc @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -49,7 +50,7 @@ DEFINE_bool( "If this option turns on, only these op in whitelist can be inplaced." "If it turns off, all of the running op can be candidate of inplaced op." "Such as scale, elementwise_add" - "By default, it's turned on"); + "By default, it's turned off"); DECLARE_string(memory_optimize_debug); @@ -263,6 +264,10 @@ void InplacePass::WithdrawModify(const NodeSwapQueue& nodes, void InplacePass::TryInplaceOpInputOutput(ir::Node* op, ir::Graph* graph) const { VLOG(4) << "Try to inplace op " << op->Name(); + // FIXME(liuwei1031): Graph is not aware of the existence of BlockDescs and + // ProgramDescs. + // The operations related to BlockDesc or ProgramDesc should perform on Graph + // or Node directly! PADDLE_ENFORCE(op->Op() != nullptr && op->Op()->Block() != nullptr, "op_desc is nullptr"); // some pre-requirments need to meet if the op want to inplaced. diff --git a/paddle/fluid/framework/details/memory_optimize_helper.cc b/paddle/fluid/framework/details/memory_optimize_helper.cc index 6345ba335997ec42ebc63f90e9bf6a3ed2648edc..c89a33fc959247afb74dab49056fc3fca8b9bd89 100644 --- a/paddle/fluid/framework/details/memory_optimize_helper.cc +++ b/paddle/fluid/framework/details/memory_optimize_helper.cc @@ -13,13 +13,22 @@ // limitations under the License. #include "paddle/fluid/framework/details/memory_optimize_helper.h" +#include #include #include -#include +#include #include #include #include +#include +#include +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/platform/cpu_info.h" + +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/gpu_info.h" +#endif // PADDLE_WITH_CUDA namespace paddle { namespace framework { @@ -27,10 +36,10 @@ namespace details { using paddle::framework::VarDesc; std::vector SortOpLikeDescOrder(const ir::Graph& graph) { - PADDLE_ENFORCE(graph.Has(kAllOpDescs), - "Graph has no attribute of kAllOpDescs."); + PADDLE_ENFORCE(graph.Has(kStaleProgramOpDescs), + "Graph has no attribute of kStaleProgramOpDescs."); // 1. get op desc order - auto& op_descs = graph.Get>(kAllOpDescs); + auto& op_descs = graph.Get>(kStaleProgramOpDescs); // 2. topology sort order auto nodes = graph.Nodes(); @@ -123,7 +132,13 @@ size_t NodeSize(const VarDesc& node) { } size_t NodeSize(ir::Node* n) { - auto* desc = FindVarDescInBlock(n); + VarDesc* desc = nullptr; + // some op do not have block pointer + if (n->inputs[0]->Op() != nullptr) { + desc = FindVarDescInBlock(n); + } else { + desc = n->Var(); + } return NodeSize(*desc); } @@ -166,6 +181,11 @@ struct NodeComparator { bool operator()(ir::Node* lhs, ir::Node* rhs) const { auto* lhs_desc = FindVarDescInBlock(lhs); auto* rhs_desc = FindVarDescInBlock(rhs); + // match data type + if (lhs_desc->GetDataType() != rhs_desc->GetDataType()) { + return false; + } + // match shape auto lhs_shape = lhs_desc->GetShape(); auto rhs_shape = rhs_desc->GetShape(); if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) || @@ -230,6 +250,27 @@ ir::Node* OrderedSet::FindBestFitNode(ir::Node* var) const { return found_node; } +ir::Node* OrderedSet::FindNextBestFitNode(ir::Node* var, ir::Node* prev) const { + ir::Node* found_node = nullptr; + NodeComparator functor; + auto it = + std::find_if(nodes_.begin(), nodes_.end(), [&](const NodeVector& v) { + if (v.front() == prev) + return true; + else + return false; + }); + PADDLE_ENFORCE(it != nodes_.end(), "Not found previous in node list!"); + for (it = std::next(it); it != nodes_.end(); ++it) { + auto& candidate = it->front(); + if (functor(var, candidate)) { + found_node = candidate; + break; + } + } + return found_node; +} + bool OrderedSet::Has(ir::Node* var) const { if (mark_table_.count(var->Name())) { auto& node_in_samename = mark_table_.at(var->Name()); @@ -241,10 +282,15 @@ bool OrderedSet::Has(ir::Node* var) const { return false; } +void OrderedSet::Erase(const std::string& var) { + PADDLE_ENFORCE(mark_table_.count(var)); + nodes_.erase(mark_table_[var]); + mark_table_.erase(var); +} + void OrderedSet::Erase(ir::Node* var) { - PADDLE_ENFORCE(mark_table_.count(var->Name())); - nodes_.erase(mark_table_[var->Name()]); - mark_table_.erase(var->Name()); + PADDLE_ENFORCE(var != nullptr); + Erase(var->Name()); } std::string OrderedSet::ToString() const { @@ -259,7 +305,10 @@ std::string OrderedSet::ToString() const { bool NodeCanReused(ir::Node* node) { // valid the node is a var node - if (node == nullptr || !node->IsVar() || node->IsCtrlVar()) return false; + // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad + if (node == nullptr || !node->IsVar() || node->IsCtrlVar() || + node->Name() == kEmptyVarName) + return false; bool flag = true; // op output force generated in cpu, can not be reused. @@ -274,20 +323,37 @@ bool NodeCanReused(ir::Node* node) { return flag; } +int MinChunkSize() { + int size{0}; +#ifdef PADDLE_WITH_CUDA + size = platform::GpuMinChunkSize(); +#else + size = platform::CpuMinChunkSize(); +#endif // PADDLE_WITH_CUDA + return size; +} + bool NodeCanReused(const VarDesc& node) { auto type = node.GetType(); + // only these types holds bulk of gpu memory if (!(type == proto::VarType::LOD_TENSOR || type == proto::VarType::SELECTED_ROWS || type == proto::VarType::LOD_TENSOR_ARRAY)) { return false; } - if (node.Persistable() || node.GetShape().empty()) { + // persistable variable is parameter + if (node.Persistable()) { return false; } - // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad - std::string name = node.Name(); - if (!name.empty() && name[0] == '@' && name[name.size() - 1] == '@') + // shape < min_chunk_size is meaningless. + // further more, fetched loss always has size = 1 + // which should not be reused. + auto shape = node.GetShape(); + int size = std::abs( + std::accumulate(shape.begin(), shape.end(), 1, std::multiplies())); + if (shape.empty() || size < MinChunkSize()) { return false; + } return true; } @@ -397,11 +463,21 @@ void ControlFlowGraph::LiveVariableAnalysis() { } } } + + for (auto* op : ops_) { + unlived_vars_[op] = std::set(); + for (auto& var : this->LiveIn(op)) { + if (!this->LiveOut(op).count(var)) { + unlived_vars_[op].insert(var); + } + } + } } void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node, const std::string& new_node, int begin_idx) { + std::vector need_update(ops_.size(), false); // update graph from begin idx to the end for (size_t i = begin_idx; i != ops_.size(); ++i) { auto* op = ops_[i]; @@ -416,15 +492,27 @@ void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node, if (live_in_[op].find(old_node) != live_in_[op].end()) { live_in_[op].erase(old_node); live_in_[op].insert(new_node); + need_update[i] = true; } if (live_out_[op].find(old_node) != live_out_[op].end()) { live_out_[op].erase(old_node); live_out_[op].insert(new_node); + need_update[i] = true; + } + } + + for (size_t i = begin_idx; i < ops_.size(); ++i) { + if (!need_update[i]) continue; + auto* op = ops_[i]; + for (auto& var : this->LiveIn(op)) { + if (!this->LiveOut(op).count(var)) { + unlived_vars_[op].insert(var); + } } } } -const std::set ControlFlowGraph::LiveIn(ir::Node* op) const { +const std::set& ControlFlowGraph::LiveIn(ir::Node* op) const { auto it = live_in_.find(op); PADDLE_ENFORCE( it != live_in_.end(), @@ -432,7 +520,7 @@ const std::set ControlFlowGraph::LiveIn(ir::Node* op) const { return it->second; } -const std::set ControlFlowGraph::LiveOut(ir::Node* op) const { +const std::set& ControlFlowGraph::LiveOut(ir::Node* op) const { auto it = live_out_.find(op); PADDLE_ENFORCE( it != live_out_.end(), @@ -440,15 +528,24 @@ const std::set ControlFlowGraph::LiveOut(ir::Node* op) const { return it->second; } -const std::set ControlFlowGraph::Use(ir::Node* op) const { +const std::set& ControlFlowGraph::Use(ir::Node* op) const { auto it = uses_.find(op); PADDLE_ENFORCE( it != uses_.end(), - string::Sprintf("Expect %s in live_out, but Not Found.", op->Name())); + string::Sprintf("Expect %s in use, but Not Found.", op->Name())); + return it->second; +} + +const std::set& ControlFlowGraph::Unlived(ir::Node* op) const { + auto it = unlived_vars_.find(op); + PADDLE_ENFORCE( + it != unlived_vars_.end(), + string::Sprintf("Expect %s in unlived_set, but Not Found.", op->Name())); + return it->second; return it->second; } -const std::vector ControlFlowGraph::Ops() const { return ops_; } +const std::vector& ControlFlowGraph::Ops() const { return ops_; } std::vector& ControlFlowGraph::Ops() { return ops_; } @@ -461,7 +558,9 @@ ir::Node* ControlFlowGraph::GetNodeByName(const std::string& name, for (auto* node : ops_) { if (node == op) break; for (auto& output : node->outputs) { - if (output->Name() == name) { + PADDLE_ENFORCE((output != nullptr && output->IsVar()), + "Output is empty!"); + if (output->Var() && output->Name() == name) { found_node = output; } } diff --git a/paddle/fluid/framework/details/memory_optimize_helper.h b/paddle/fluid/framework/details/memory_optimize_helper.h index 0bfaf827fea84030de48a9984197f5b39f5c9261..b5348cc66eaa446719b299b63caa340eab3e2ab9 100644 --- a/paddle/fluid/framework/details/memory_optimize_helper.h +++ b/paddle/fluid/framework/details/memory_optimize_helper.h @@ -29,8 +29,6 @@ namespace paddle { namespace framework { namespace details { -constexpr char kAllOpDescs[] = "all_op_descs"; - std::vector SortOpLikeDescOrder(const ir::Graph& graph); // NOTE(dzh): A ordered set for node reuse in memory optimize. @@ -55,6 +53,7 @@ class OrderedSet { void Insert(ir::Node* var); void Erase(ir::Node* var); + void Erase(const std::string& var); bool Has(ir::Node* var) const; void Clear() { mark_table_.clear(); @@ -62,6 +61,7 @@ class OrderedSet { } // find the bestfit shape node block with var. ir::Node* FindBestFitNode(ir::Node* var) const; + ir::Node* FindNextBestFitNode(ir::Node* var, ir::Node* prev) const; // map store non-const iterator, can not promise const int GetNodeIndexInPool(ir::Node* var); // pool all node to string @@ -92,10 +92,11 @@ class ControlFlowGraph { void RenameVarInCFGGraph(const std::string& old_node, const std::string& new_node, int begin_idx); - const std::set LiveIn(ir::Node* op) const; - const std::set LiveOut(ir::Node* op) const; - const std::set Use(ir::Node* op) const; - const std::vector Ops() const; + const std::set& LiveIn(ir::Node* op) const; + const std::set& LiveOut(ir::Node* op) const; + const std::set& Use(ir::Node* op) const; + const std::set& Unlived(ir::Node* op) const; + const std::vector& Ops() const; std::vector& Ops(); // for ssa-graph nodes @@ -117,6 +118,7 @@ class ControlFlowGraph { VarSetMap live_out_; VarSetMap uses_; // op inputs VarSetMap defs_; // op outputs + std::unordered_map> unlived_vars_; std::vector ops_; // op sequence by topology sort }; diff --git a/paddle/fluid/framework/details/memory_optimize_helper_test.cc b/paddle/fluid/framework/details/memory_optimize_helper_test.cc index 5c13dda9e5491044d2bcbed4b24d438cc8b8a413..5389e76e0c65c7c0ee23004ca1b0a56efb4c54fe 100644 --- a/paddle/fluid/framework/details/memory_optimize_helper_test.cc +++ b/paddle/fluid/framework/details/memory_optimize_helper_test.cc @@ -107,6 +107,52 @@ TEST(OrderedSet, Normal) { ASSERT_EQ(pool.GetNodeIndexInPool(cache), 5); // match 4:[5,2] } } + +TEST(OrderedSet, FindBestFitNode) { + OrderedSet pool; + std::vector> nodes; + ProgramDesc prog; + BlockDesc* block_desc = prog.MutableBlock(0); + auto* op_desc = block_desc->AppendOp(); + op_desc->SetType("dummy"); + std::unique_ptr op = ir::CreateNodeForTest(op_desc); + + { + auto desc = block_desc->Var("a"); + desc->SetShape({128, 128}); + std::unique_ptr node = ir::CreateNodeForTest(desc); + node->inputs.emplace_back(op.get()); + nodes.emplace_back(std::move(node)); + } + { + auto desc = block_desc->Var("b"); + desc->SetShape({128, 129}); + std::unique_ptr node = ir::CreateNodeForTest(desc); + node->inputs.emplace_back(op.get()); + nodes.emplace_back(std::move(node)); + } + { + auto desc = block_desc->Var("c"); + desc->SetShape({128, 128}); + std::unique_ptr node = ir::CreateNodeForTest(desc); + node->inputs.emplace_back(op.get()); + nodes.emplace_back(std::move(node)); + } + + for (auto& node : nodes) { + pool.Insert(node.get()); + } + + // FindNextBestFitNode + auto* n = nodes[0].get(); + auto* cache = pool.FindBestFitNode(n); + PADDLE_ENFORCE(cache->Name() == "a"); + cache = pool.FindNextBestFitNode(n, cache); + PADDLE_ENFORCE(cache->Name() == "c"); + cache = pool.FindNextBestFitNode(n, cache); + PADDLE_ENFORCE(cache->Name() == "b"); +} + } // namespace details } // namespace framework } // namespace paddle @@ -182,9 +228,6 @@ TEST(CFGGraph, IRGraph) { // prepare ir graph auto prog = FillProgramDesc(); ir::Graph graph(prog); - const std::vector* all_op_descs = - new std::vector(prog.Block(0).AllOps()); - graph.Set(details::kAllOpDescs, all_op_descs); // take ownership ControlFlowGraph cfg(graph); cfg.LiveVariableAnalysis(); @@ -210,9 +253,6 @@ TEST(CFGGraph, IRGraph) { TEST(SortOpLikeDescOrder, NormalTest) { auto prog = FillProgramDesc(); ir::Graph graph(prog); - const std::vector* all_op_descs = - new std::vector(prog.Block(0).AllOps()); - graph.Set(details::kAllOpDescs, all_op_descs); // take ownership auto nodes = SortOpLikeDescOrder(graph); auto op_descs = prog.Block(0).AllOps(); @@ -227,9 +267,6 @@ TEST(SortOpLikeDescOrder, NormalTest) { TEST(SortOpLikeDescOrder, RemoveOpDesc) { auto prog = FillProgramDesc(); ir::Graph graph(prog); - const std::vector* all_op_descs = - new std::vector(prog.Block(0).AllOps()); - graph.Set(details::kAllOpDescs, all_op_descs); // take ownership auto nodes = graph.Nodes(); auto op_descs = prog.Block(0).AllOps(); ir::Node* found_node = nullptr; @@ -278,8 +315,6 @@ TEST(SortOpLikeDescOrder, RemoveOpDesc) { // 3. add some op_desc TEST(SortOpLikeDescOrder, AddOpDesc) { auto prog = FillProgramDesc(); - const std::vector* all_op_descs = - new std::vector(prog.Block(0).AllOps()); ir::Graph graph(prog); auto find_node_in_graph = [&](std::string s) { @@ -296,9 +331,7 @@ TEST(SortOpLikeDescOrder, AddOpDesc) { // cached desc different with real one // mimic the intermidiete pass modify the programdesc. - graph.Set(details::kAllOpDescs, all_op_descs); // take ownership - - auto op_descs = prog.Block(0).AllOps(); + std::vector op_descs = graph.OriginProgram().Block(0).AllOps(); auto op = prog.MutableBlock(0)->AppendOp(); prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); @@ -330,9 +363,6 @@ TEST(SortOpLikeDescOrder, AddOpDesc) { TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) { auto prog = FillProgramDesc(); ir::Graph graph(prog); - const std::vector* all_op_descs = - new std::vector(prog.Block(0).AllOps()); - graph.Set(details::kAllOpDescs, all_op_descs); // take ownership auto find_node_in_graph = [&](std::string s) { ir::Node* ret = nullptr; @@ -346,8 +376,9 @@ TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) { return ret; }; + std::vector op_descs = graph.OriginProgram().Block(0).AllOps(); + // remove sum node - auto op_descs = prog.Block(0).AllOps(); ir::Node* found_node = nullptr; auto nodes = graph.Nodes(); for (auto node : nodes) { @@ -408,9 +439,7 @@ TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) { TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) { auto prog = FillProgramDesc(); ir::Graph graph(prog); - const std::vector* all_op_descs = - new std::vector(prog.Block(0).AllOps()); - graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + std::vector op_descs = graph.OriginProgram().Block(0).AllOps(); auto find_node_in_graph = [&](std::string s) { ir::Node* ret = nullptr; @@ -424,7 +453,6 @@ TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) { return ret; }; - auto op_descs = prog.Block(0).AllOps(); // add node auto op = prog.MutableBlock(0)->AppendOp(); prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); diff --git a/paddle/fluid/framework/details/memory_optimize_pass.cc b/paddle/fluid/framework/details/memory_optimize_pass.cc index 41e4a834df0abab069ab1f6cd21c8479b911250c..80720af32d5670928a6ad2b9efbeadf6452b0273 100644 --- a/paddle/fluid/framework/details/memory_optimize_pass.cc +++ b/paddle/fluid/framework/details/memory_optimize_pass.cc @@ -24,6 +24,7 @@ #include #include #include +#include #include #include "gflags/gflags.h" #include "paddle/fluid/framework/data_type.h" @@ -69,58 +70,60 @@ std::unique_ptr MemoryOptimizePass::ApplyImpl( } for (auto& var : op->outputs) { - if (!NodeCanReused(var) || cfg_->Use(op).count(var->Name()) == 0 || - skip_set_.count(var->Name())) + if (var->IsVar() && !var->IsCtrlVar() && skip_set_.count(var->Name())) { + VLOG(3) << "Skip set contains variable of " << var->Name() + << "disable reuse on it. skipped"; continue; - ir::Node* cache = pool_.FindBestFitNode(var); - - if (var->Name() == FLAGS_memory_optimize_debug) { - VLOG(3) << "start match var " << DebugString(var) << " of op " - << op->Name(); - VLOG(3) << pool_.ToString(); - VLOG(3) << "matched in pool : " - << ((cache == nullptr) ? "False" : "True"); } + if (NodeCanReused(var) && cfg_->Use(op).count(var->Name()) == 0) { + ir::Node* cache = pool_.FindBestFitNode(var); + while (cache != nullptr && var->Name() == cache->Name()) { + VLOG(3) << "The same cache variable is cascade reused. " + << cache->Name() << " is re-filled to the pool after " + << "the reused op is finished. Current op can not " + << "replace it again. Skip this candidate."; + cache = pool_.FindNextBestFitNode(var, cache); + } + if (var->Name() == FLAGS_memory_optimize_debug) { + VLOG(3) << "start match var " << DebugString(var) << " of op " + << op->Name(); + VLOG(3) << pool_.ToString(); + VLOG(3) << "matched in pool : " + << ((cache == nullptr) ? "False" : "True"); + } - if (cache == nullptr) continue; - if (var->Name() == cache->Name()) { - VLOG(3) << "The same cache variable is cascade reused." << var->Name() - << " is re-filled to the pool after" - << "the reused op is finished. Current op can not " - << "replace it again. Skip this candidate."; - continue; - - int node_idx_in_pool = pool_.GetNodeIndexInPool(cache); - VLOG(3) << string::Sprintf( - "!!! %s, %s => %s, cache idx %d, pool size %d", - std::to_string(reuse_id++), DebugString(var), DebugString(cache), - node_idx_in_pool, static_cast(pool_.size())); - - // update CFG Graph on the fly. - // reused var maybe re-fill into the pool - cfg_->RenameVarInCFGGraph(var->Name(), cache->Name(), idx); - // NOTE(dzhwinter): we need to both update the ProgramDesc - // and IR Graph. because op_desc/var_desc is used in CreateOp, - // CreateVar when running happens. But IR Graph - // define the dependence relationship between nodes. - RenameVarInGraphDesc(var->Name(), cache->Name(), idx); - RenameVarInGraphNode(var->Name(), cache->Name(), idx, graph.get()); - - pool_.Erase(cache); - } + if (cache != nullptr) { + int node_idx_in_pool = pool_.GetNodeIndexInPool(cache); + VLOG(3) << string::Sprintf( + "!!! %s, %s => %s, cache idx %d, pool size %d", + std::to_string(reuse_id++), DebugString(var), DebugString(cache), + node_idx_in_pool, static_cast(pool_.size())); + // NOTE(dzhwinter): update the ProgramDesc/IR Graph + // and the CFG Graph on the fly. + // + // IR Graph define the dependence relationship between nodes. + // + // ProgramDesc defines the input/output vars. Its used in + // CreateOp, CreateVar when running happens. + // + // CFG Graph store the liveness information, when reuse happens + // we also need to update the variable liveness. + const std::string var_name = var->Name(); + const std::string cache_name = cache->Name(); - // fill the pool - std::unordered_set unlived_vars; - for (auto var : cfg_->LiveIn(op)) { - if (cfg_->LiveOut(op).count(var) == 0) { - unlived_vars.emplace(var); + cfg_->RenameVarInCFGGraph(var_name, cache_name, idx); + RenameVarInGraphDesc(var_name, cache_name, idx); + RenameVarInGraphNode(var_name, cache_name, idx, graph.get()); + pool_.Erase(cache_name); } } - for (auto var : unlived_vars) { - ir::Node* var_node = cfg_->GetNodeByName(var, op); - if (NodeCanReused(var_node) && !pool_.Has(var_node)) { - pool_.Insert(var_node); - } + } + // fill the pool + for (auto& var : cfg_->Unlived(op)) { + ir::Node* var_node = cfg_->GetNodeByName(var, op); + if (var_node == nullptr || var_node->IsCtrlVar()) continue; + if (NodeCanReused(var_node) && !pool_.Has(var_node)) { + pool_.Insert(var_node); } } } @@ -189,8 +192,13 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const { // immediately to make the subblock variable reuse strategy take // effect. Because it is a single op in graph. No need to // update the ir nodes. + // FIXME(liuwei1031): Graph is not aware of the existence of + // BlockDescs and ProgramDescs. + // The operations related to BlockDesc or ProgramDesc should perform + // on Graph or Node directly! sub_op_desc->Rename(var->Name(), cache->Name()); - if (sub_op_desc->Block()->HasVar(var->Name())) { + if (sub_op_desc->Block() != nullptr && + sub_op_desc->Block()->HasVar(var->Name())) { sub_op_desc->Block()->RemoveVar(var->Name()); } } @@ -231,7 +239,13 @@ void MemoryOptimizePass::RenameVarInGraphDesc(const std::string& var, auto* op_desc = op->Op(); op_desc->RenameInput(var, cache_var); op_desc->RenameOutput(var, cache_var); - if (op_desc->Block()->HasVar(var)) op_desc->Block()->RemoveVar(var); + if (op_desc->Block() != nullptr) { + op_desc->Block()->RemoveVar(var); + } else { + LOG(WARNING) << "op " << op->Name() << " not know its block." + << "Is the op_desc created without block pointer? " + << "Can not find " << var << " in Block(0)"; + } op_desc->Flush(); } } @@ -273,8 +287,7 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var, // redirect the input to the latest version of cache_var for (auto* node : op->inputs) { if (node->Name() == var) { - ir::Node* cache_node = graph->CreateVarNode(var_desc.get()); - var_nodes_[cache_var].emplace_back(cache_node); + ir::Node* cache_node = var_nodes_[cache_var].back(); // swap node to cache_node cache_node->outputs.insert(cache_node->outputs.end(), @@ -283,11 +296,15 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var, auto* prev_op = node->inputs[0]; std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), node, cache_node); - cache_node->inputs.emplace_back(prev_op); for (auto* next_op : node->outputs) { std::replace(next_op->inputs.begin(), next_op->inputs.end(), node, cache_node); } + + // erase unused node + auto& nodes = var_nodes_.at(var); + nodes.erase(std::remove(nodes.begin(), nodes.end(), node), nodes.end()); + graph->RemoveNode(node); } } @@ -307,15 +324,14 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var, std::replace(next_op->inputs.begin(), next_op->inputs.end(), node, cache_node); } + + // erase unused node + auto& nodes = var_nodes_.at(var); + nodes.erase(std::remove(nodes.begin(), nodes.end(), node), nodes.end()); + graph->RemoveNode(node); } } } - - // release node of unused var in graph - for (auto* node : var_nodes_[var]) { - graph->RemoveNode(node); - } - var_nodes_.at(var).clear(); } } // namespace details @@ -324,4 +340,4 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var, REGISTER_PASS(memory_optimize_pass, paddle::framework::details::MemoryOptimizePass) - .RequireGraphAttr(paddle::framework::details::kAllOpDescs); + .RequireGraphAttr(paddle::framework::details::kStaleProgramOpDescs); diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 75f922d2cca6855a67be7284ae407e549a1a1afb..478d2ffbcf2988487893984284d4597f018f0ca0 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -392,20 +392,32 @@ void MultiDevSSAGraphBuilderBase::CreateComputationalOp(ir::Graph *result, void MultiDevSSAGraphBuilderBase::CreateAllReduceOp( ir::Graph *result, const std::string &og) const { + OpHandleBase *op_handle = nullptr; + + auto append_allreduce_op = [&]( + const std::vector &scopes, + const std::vector &places) -> OpHandleBase * { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - result->Get(kGraphOps).emplace_back(new AllReduceOpHandle( - result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), - local_scopes_, places_, nccl_ctxs_)); + result->Get(kGraphOps).emplace_back(new AllReduceOpHandle( + result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), + scopes, places, nccl_ctxs_)); #else - result->Get(kGraphOps).emplace_back(new AllReduceOpHandle( - result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), - local_scopes_, places_)); + result->Get(kGraphOps).emplace_back(new AllReduceOpHandle( + result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), + scopes, places)); #endif - auto *op_handle = result->Get(kGraphOps).back(); + return result->Get(kGraphOps).back(); + }; + + if (!strategy_.enable_parallel_graph_) + op_handle = append_allreduce_op(local_scopes_, places_); for (size_t i = 0; i < places_.size(); ++i) { - auto &p = places_[i]; - SetCommunicationContext(op_handle, p); + if (strategy_.enable_parallel_graph_) { + op_handle = append_allreduce_op({local_scopes_[i]}, {places_[i]}); + } + + SetCommunicationContext(op_handle, places_[i]); auto &vars = result->Get(kGraphVars)[i][og]; PADDLE_ENFORCE(!vars.empty()); auto &prev_grad = vars.back(); @@ -413,7 +425,7 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp( auto var = new VarHandle(result->CreateEmptyNode(og, ir::Node::Type::kVariable), - vars.size(), i, og, p); + vars.size(), i, og, places_[i]); vars.emplace_back(var); op_handle->AddOutput(var); } @@ -925,9 +937,21 @@ void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result, } void DistSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const { - if (need_broadcast_var_ || - (UseGPU() && - strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce)) { + // broad cast received parameters when training in parameter server mode. + if (need_broadcast_var_) { + // There are 4 conditions: + // 1. GPU && Reduce: Reduce gradient then broadcast gradient to other GPUS. + // Need to broadcast received parameters to other GPU. + // 2. GPU && AllReduce: AllReduce all graident to each GPU. Need to + // broadcast received parameters to other GPU. + // 3. CPU && AllReduce: AllReduce all gradient to each thread. Need to + // broadcast received parameters to other scope. + // 4. CPU && Reduce: because all parameters share the same memory, did not + // broadcast received parameters. + if (!UseGPU() && + strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) { + return; + } if (strategy_.fuse_broadcast_op_) { CreateFusedBroadcastOp(result, bcast_var_name_set_); } else { diff --git a/paddle/fluid/framework/details/multi_devices_helper.h b/paddle/fluid/framework/details/multi_devices_helper.h index 1a2b75fbc0c28984ce5cf00e0a2ce0f804349bb1..9afbb91005c9c3a9d2e185f4dfa901ebf812ee19 100644 --- a/paddle/fluid/framework/details/multi_devices_helper.h +++ b/paddle/fluid/framework/details/multi_devices_helper.h @@ -36,13 +36,14 @@ namespace details { // map from variable name to variables. The variables, who have the same name, // will have a differsent version. The offset in the // `std::vector` is the version of varaibles. -typedef std::vector>> +typedef std::vector>> GraphVars; const char kGraphVars[] = "vars"; // aux variables to represent dependency. Useful to resolve data hazard. -typedef std::unordered_set GraphDepVars; +typedef std::unordered_set GraphDepVars; const char kGraphDepVars[] = "dep_vars"; + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index b1a82e8771b92f2d0af4a1c7732ff2da54d496a8..e0aa352e95bc3685a1f4879bffa6e86eecd7e7f9 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -70,6 +70,9 @@ class OpHandleBase { auto it = dev_ctxes_.find(place); return it != dev_ctxes_.end() ? it->second : nullptr; } + const std::map &DeviceContext() { + return dev_ctxes_; + } void SetDeviceContext(platform::Place place, platform::DeviceContext *ctx_) { dev_ctxes_[place] = ctx_; diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc index e8deb5bfc6c013addce11e2a04286a828acc1930..2afac32437dd79a54ef7d1ee2d203a34c1b5f30e 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -13,22 +13,92 @@ // limitations under the License. #include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" +#include +#include +#include "paddle/fluid/framework/ir/graph_helper.h" namespace paddle { namespace framework { namespace details { +std::vector> +ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(ir::Graph *graph) { + std::vector> graphs; + graphs.reserve(places_.size()); + for (size_t i = 0; i < places_.size(); ++i) { + ProgramDesc empty; + graphs.emplace_back(std::unique_ptr(new ir::Graph(empty))); + auto &g = graphs.back(); + g->Set(kGraphVars, new GraphVars(1UL)); + g->Set(kGraphDepVars, new GraphDepVars); + auto &stale_ops = + graph->Get>(details::kStaleProgramOpDescs); + g->Erase(details::kStaleProgramOpDescs); + g->Set>(details::kStaleProgramOpDescs, + new std::vector(stale_ops)); + } + auto op_handles = ir::FilterByNodeWrapper(*graph); + + for (auto &op : op_handles) { + auto &dev_ctx = op->DeviceContext(); + auto &p = dev_ctx.begin()->first; + int dev_id = boost::get(p).device; + auto &dev_dummys = graphs[dev_id]->Get(kGraphDepVars); + graphs[dev_id]->AddNode(graph->RemoveNode(op->Node()).release()); + + for (auto &var : op->Inputs()) { + auto dummy_ptr = dynamic_cast(var); + if (dummy_ptr) { + dev_dummys.insert(var); + if (graph->Nodes().count(var->Node())) + graphs[dev_id]->AddNode(graph->RemoveNode(var->Node()).release()); + } + } + for (auto &var : op->Outputs()) { + auto dummy_ptr = dynamic_cast(var); + if (dummy_ptr) { + dev_dummys.insert(var); + if (graph->Nodes().count(var->Node())) + graphs[dev_id]->AddNode(graph->RemoveNode(var->Node()).release()); + } + } + } + + for (size_t dev_id = 0; dev_id < places_.size(); ++dev_id) { + auto &dev_vars = graphs[dev_id]->Get(kGraphVars)[0]; + auto &origin_vars = graph->Get(kGraphVars)[dev_id]; + for (auto &name_pair : origin_vars) { + dev_vars.emplace(name_pair.first, name_pair.second); + for (auto &version_pair : name_pair.second) { + if (graph->Nodes().count(version_pair->Node())) { + graphs[dev_id]->AddNode( + graph->RemoveNode(version_pair->Node()).release()); + } + } + } + } + + return graphs; +} + ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( const ExecutionStrategy &strategy, const std::vector &local_scopes, - const std::vector &places, - std::vector> &&graphs) + const std::vector &places, ir::Graph *graph) : strategy_(std::move(strategy)), local_scopes_(std::move(local_scopes)), pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr), places_(std::move(places)), - graphs_(std::move(graphs)) { + // TODO(Yancey1989): Copying graphs is not safely since it deleted the + // attrs. + graphs_(SeparateMultiDevicesGraph(graph)) { PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); + auto seq_allreduce_pass = + ir::PassRegistry::Instance().Get("all_reduce_deps_pass"); + for (size_t i = 0; i < graphs_.size(); ++i) { + graphs_[i] = seq_allreduce_pass->Apply(std::move(graphs_[i])); + } + // set the correct size of thread pool to each device. strategy_.num_threads_ = strategy_.num_threads_ < places_.size() ? 1UL @@ -37,7 +107,7 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( << " to run the operators of the graph on each device."; for (size_t i = 0; i < places.size(); ++i) { executors_.emplace_back(new details::ThreadedSSAGraphExecutor( - strategy_, {local_scopes_[i]}, {places_[i]}, std::move(graphs_[i]))); + strategy_, local_scopes_, {places_[i]}, graphs_.at(i).get())); } } diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h index c00c5bc2d1b4b78593f99c819b5a3d642150e773..1e421f2a3a51363fe368859f7a34593c8c894077 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h @@ -18,7 +18,9 @@ #include #include "ThreadPool.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" +#include "paddle/fluid/framework/ir/graph.h" namespace paddle { namespace framework { @@ -29,13 +31,17 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor { ParallelSSAGraphExecutor(const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, - std::vector> &&graphs); + ir::Graph *graph); ~ParallelSSAGraphExecutor() final = default; + const ir::Graph &Graph() const override { return *graphs_[0]; } FeedFetchList Run(const std::vector &fetch_tensors) override; private: + std::vector> SeparateMultiDevicesGraph( + ir::Graph *graph); + ExecutionStrategy strategy_; std::vector local_scopes_; std::unique_ptr<::ThreadPool> pool_{nullptr}; diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index ee4c8a6ecf77e5d0f23f38b763917d926afdb07a..4e2477c205db5966aa0b2d0c7a608be94a69eb82 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -139,7 +139,7 @@ void ReduceOpHandle::GatherSelectedRows( #endif void ReduceOpHandle::RunImpl() { - platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second); + platform::RecordEvent record_event(Name()); if (places_.size() == 1) return; // the input and output may have dummy var. @@ -153,7 +153,7 @@ void ReduceOpHandle::RunImpl() { { auto out_var_handles = DynamicCast(outputs_); - PADDLE_ENFORCE_EQ(out_var_handles.size(), 1, + PADDLE_ENFORCE_EQ(out_var_handles.size(), 1UL, "The number of output should be one."); out_var_handle = out_var_handles.front(); } diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index 13a042d8e6ed7f18c76387b666d681df0eabd0b5..6092143449bc8e20117e7021bd44553cf64ae5b5 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -12,9 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include #include #include +#include +#include +#include #include #include "paddle/fluid/framework/details/computation_op_handle.h" @@ -189,15 +193,6 @@ ExtractComputationOpFromLastLivedVar(VarHandle *var, size_t scope_idx, return shrink_func(computation_op); } -static VarDesc *TryGetLatestVarDesc(const std::vector &vars) { - VarDesc *var_desc = nullptr; - std::find_if(vars.rbegin(), vars.rend(), [&](VarHandle *var_handle) -> bool { - var_desc = var_handle->Node()->Var(); - return var_desc != nullptr; - }); - return var_desc; -} - std::unique_ptr ReferenceCountPass::ApplyImpl( std::unique_ptr graph) const { auto &ref_cnts = Get>(kGlobalReferenceCount); diff --git a/paddle/fluid/framework/details/reference_count_pass_helper.cc b/paddle/fluid/framework/details/reference_count_pass_helper.cc index 89bd08c2d041d795205b29bb29aba311d1dbd932..94de0e6ab0a91d90a7f2c4c4fc14eb78663c95fe 100644 --- a/paddle/fluid/framework/details/reference_count_pass_helper.cc +++ b/paddle/fluid/framework/details/reference_count_pass_helper.cc @@ -13,9 +13,22 @@ // limitations under the License. #include "paddle/fluid/framework/details/reference_count_pass_helper.h" +#include "paddle/fluid/framework/details/var_handle.h" +#include "paddle/fluid/framework/var_desc.h" namespace paddle { namespace framework { -namespace details {} // namespace details +namespace details { + +VarDesc *TryGetLatestVarDesc(const std::vector &vars) { + VarDesc *var_desc = nullptr; + std::find_if(vars.rbegin(), vars.rend(), [&](VarHandle *var_handle) -> bool { + var_desc = var_handle->Node()->Var(); + return var_desc != nullptr; + }); + return var_desc; +} + +} // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/reference_count_pass_helper.h b/paddle/fluid/framework/details/reference_count_pass_helper.h index 1c083dbf001b08e40a54cc89b21c3dea1f18f16a..ce700119c54ddd711315dfa45d61b9241cfda651 100644 --- a/paddle/fluid/framework/details/reference_count_pass_helper.h +++ b/paddle/fluid/framework/details/reference_count_pass_helper.h @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -25,6 +26,10 @@ namespace paddle { namespace framework { + +class VarDesc; +class VarHandle; + namespace details { class ComputationOpHandle; @@ -43,9 +48,11 @@ const char kGarbageCollector[] = "garbage_collector"; const char kAllPlaces[] = "all_places"; using LastLiveOpsOfVars = - std::unordered_map>; + std::unordered_map>; const char kLastLiveOpsOfVars[] = "last_live_ops_of_var"; +VarDesc *TryGetLatestVarDesc(const std::vector &vars); + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index 91e4f9adb418978c30f512abe6924c0ace182124..7b13112986f9ad85056a3e8a5a6ed99bd0be95d5 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -63,7 +63,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( eptr = std::current_exception(); } - platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr); + platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun"); ++drop_scope_counter_; bool stream_end = false; diff --git a/paddle/fluid/framework/details/sequential_execution_pass.cc b/paddle/fluid/framework/details/sequential_execution_pass.cc index 879fb29d5926941e574d0080051c195293bc60a9..0b53a76e7877891509ea4d0334673ae2a1fcf949 100644 --- a/paddle/fluid/framework/details/sequential_execution_pass.cc +++ b/paddle/fluid/framework/details/sequential_execution_pass.cc @@ -40,7 +40,7 @@ std::unique_ptr SequentialExecutionPass::ApplyImpl( static std::unordered_set skip_dist_ops{ "send", "recv", "send_barrier", "fetch_barrier"}; - auto &ops = Get>(kAllOpDescs); + auto &ops = graph->Get>(kStaleProgramOpDescs); std::vector op_node_list; op_node_list.reserve(ops.size()); @@ -107,4 +107,4 @@ std::unique_ptr SequentialExecutionPass::ApplyImpl( REGISTER_PASS(sequential_execution_pass, paddle::framework::details::SequentialExecutionPass) - .RequirePassAttr(paddle::framework::details::kAllOpDescs); + .RequireGraphAttr(paddle::framework::details::kStaleProgramOpDescs); diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 677a2937945b03fa577317cb4f26e09354d06957..9ba295a2b06a5ee9c3069e95fa688595fe72d6fd 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -23,9 +23,8 @@ namespace framework { namespace details { ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( const ExecutionStrategy &strategy, const std::vector &local_scopes, - const std::vector &places, - std::unique_ptr &&graph) - : graph_(std::move(graph)), + const std::vector &places, ir::Graph *graph) + : graph_(graph), pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_) : nullptr), local_scopes_(local_scopes), @@ -37,7 +36,7 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( FeedFetchList ThreadedSSAGraphExecutor::Run( const std::vector &fetch_tensors) { std::unique_ptr event( - new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare", nullptr)); + new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare")); std::unordered_map pending_ops; std::unordered_set pending_vars; auto ready_vars = std::make_shared>(); @@ -110,7 +109,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( for (auto &run_op_future : run_op_futures_) { run_op_future.wait(); } - ClearFetchOp(graph_.get(), &fetch_ops); + ClearFetchOp(graph_, &fetch_ops); exception_holder_.ReThrow(); } else { continue; @@ -135,7 +134,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( } PADDLE_ENFORCE(ready_ops.empty()); // Wait FetchOps. - ClearFetchOp(graph_.get(), &fetch_ops); + ClearFetchOp(graph_, &fetch_ops); return fetch_data; } @@ -219,7 +218,7 @@ void ThreadedSSAGraphExecutor::RunOp( VLOG(10) << op << " " << op->Name() << " Done "; running_ops_--; ready_var_q->Extend(op->Outputs()); - VLOG(10) << op << " " << op->Name() << "Signal posted"; + VLOG(10) << op << " " << op->Name() << " Signal posted"; } catch (...) { exception_holder_.Catch(std::current_exception()); } diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 24da56c09e3e0f3894d58e5af8838c98e3e1e67c..0867f6210480ec405e7cc4ea42c74b750133ea4e 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -41,7 +41,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, - std::unique_ptr &&graph); + ir::Graph *graph); const ir::Graph &Graph() const override { return *graph_; } // Run a SSAGraph by a thread pool @@ -55,7 +55,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { details::OpHandleBase *op); private: - std::unique_ptr graph_; + ir::Graph *graph_; std::unique_ptr<::ThreadPool> pool_; std::vector local_scopes_; std::vector places_; diff --git a/paddle/fluid/framework/details/while_op_eager_deletion_pass.cc b/paddle/fluid/framework/details/while_op_eager_deletion_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..fd6b6dd2274d9721b8754e16cd7b4f1ab596380d --- /dev/null +++ b/paddle/fluid/framework/details/while_op_eager_deletion_pass.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/operators/controlflow/while_op_helper.h" + +namespace paddle { +namespace framework { +namespace details { + +class WhileOpEagerDeletionPass : public ir::Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override { + auto all_ops = ir::FilterByNodeWrapper(*graph); + + // Find all while_op and while_grad_op + std::unordered_map, + std::vector>> + target_ops; + for (auto *op : all_ops) { + auto compute_op = dynamic_cast(op); + if (compute_op == nullptr) continue; + + if (compute_op->Name() == "while") { + target_ops[compute_op->GetScopeIdx()].first.emplace_back( + compute_op->GetOp()); + } else if (compute_op->Name() == "while_grad") { + target_ops[compute_op->GetScopeIdx()].second.emplace_back( + compute_op->GetOp()); + } + } + + for (auto &ops_pair : target_ops) { + auto &while_ops = ops_pair.second.first; + auto &while_grad_ops = ops_pair.second.second; + operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( + while_ops, while_grad_ops); + } + return graph; + } +}; + +} // namespace details +} // namespace framework +} // namespace paddle + +REGISTER_PASS(while_op_eager_deletion_pass, + paddle::framework::details::WhileOpEagerDeletionPass); diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 4323883fa5cc9b26a68c2980f3b7a49eca610543..99192292b0be992d5ff0ecebba6294b9ba27e958 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -14,25 +14,31 @@ limitations under the License. */ #include "paddle/fluid/framework/executor.h" #include +#include +#include +#include +#include #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/transfer_scope_cache.h" #include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/operators/controlflow/while_op_helper.h" #include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" #ifdef PADDLE_WITH_NGRAPH #include "paddle/fluid/operators/ngraph/ngraph_engine.h" +DEFINE_bool(use_ngraph, false, "Use NGRAPH to run"); #endif DECLARE_bool(benchmark); DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run"); -DEFINE_bool(use_ngraph, false, "Use NGRAPH to run"); namespace paddle { namespace framework { @@ -74,11 +80,11 @@ static std::unordered_map GetNonPersistableReferenceCounts( ExecutorPrepareContext::ExecutorPrepareContext( const framework::ProgramDesc& prog, size_t block_id, - const std::vector& skip_ref_cnt_vars) - : prog_(prog), block_id_(block_id) { - if (GetEagerDeletionThreshold() >= 0) { - global_ref_cnts_ = GetNonPersistableReferenceCounts(prog.Block(block_id), - skip_ref_cnt_vars); + const std::vector& keep_vars, bool force_disable_gc) + : prog_(prog), block_id_(block_id), force_disable_gc_(force_disable_gc) { + if (GetEagerDeletionThreshold() >= 0 && !force_disable_gc_) { + global_ref_cnts_ = + GetNonPersistableReferenceCounts(prog.Block(block_id), keep_vars); } } @@ -183,13 +189,12 @@ void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope, } void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, - bool create_local_scope, bool create_vars) { + bool create_local_scope, bool create_vars, + const std::vector& skip_ref_cnt_vars, + bool force_disable_gc) { platform::RecordBlock b(block_id); if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc); -#ifdef PADDLE_WITH_NGRAPH - if (FLAGS_use_ngraph) operators::NgraphEngine::EnableNgraph(pdesc); -#endif - auto ctx = Prepare(pdesc, block_id); + auto ctx = Prepare(pdesc, block_id, skip_ref_cnt_vars, force_disable_gc); RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars); } @@ -356,20 +361,27 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, std::unique_ptr Executor::Prepare( const ProgramDesc& program, int block_id, - const std::vector& skip_ref_cnt_vars) { - std::unique_ptr ctx( - new ExecutorPrepareContext(program, block_id, skip_ref_cnt_vars)); + const std::vector& skip_ref_cnt_vars, bool force_disable_gc) { + std::unique_ptr ctx(new ExecutorPrepareContext( + program, block_id, skip_ref_cnt_vars, force_disable_gc)); PADDLE_ENFORCE_LT(static_cast(block_id), program.Size()); auto& block = program.Block(block_id); for (auto& op_desc : block.AllOps()) { ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc)); } +#ifdef PADDLE_WITH_NGRAPH + if (FLAGS_use_ngraph) { + paddle::operators::NgraphEngine::FuseNgraphOps( + ctx->prog_.Block(ctx->block_id_), &ctx->ops_); + } +#endif return ctx; } std::vector> Executor::Prepare( const ProgramDesc& program, const std::vector& block_ids, - const std::vector>& skip_ref_cnt_vars) { + const std::vector>& skip_ref_cnt_vars, + bool force_disable_gc) { PADDLE_ENFORCE( skip_ref_cnt_vars.empty() || skip_ref_cnt_vars.size() == block_ids.size(), "skip_ref_cnt_vars should be either empty or equals to block number %d", @@ -379,9 +391,11 @@ std::vector> Executor::Prepare( for (auto& bid : block_ids) { ExecutorPrepareContext* ctx; if (skip_ref_cnt_vars.empty()) { - ctx = new ExecutorPrepareContext(program, bid); + ctx = new ExecutorPrepareContext(program, bid, std::vector(), + force_disable_gc); } else { - ctx = new ExecutorPrepareContext(program, bid, skip_ref_cnt_vars[idx]); + ctx = new ExecutorPrepareContext(program, bid, skip_ref_cnt_vars[idx], + force_disable_gc); } PADDLE_ENFORCE_LT(static_cast(bid), program.Size()); auto& block = program.Block(bid); @@ -408,8 +422,9 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, int64_t max_memory_size = GetEagerDeletionThreshold(); std::unique_ptr gc; - // skip while_op and while_grad_op temporarily - if (max_memory_size >= 0 && !keep_kids) { + // FIXME(zjl): recurrent_op is rather complex, we would + // disable gc forcely in recurrent_op + if (!ctx->force_disable_gc_ && max_memory_size >= 0) { ctx->ResetReferenceCount(); #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place_)) { @@ -427,6 +442,11 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, #ifdef PADDLE_WITH_CUDA } #endif + // If gc is enabled and block size > 1 + if (gc && ctx->prog_.Size() > 1) { + operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(ctx->block_id_, + ctx->ops_); + } } for (auto& op : ctx->ops_) { diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 5a040ac641588ad4d89d1f6e4c0d6c296eff38eb..65cb9e51ab2c9208b6bfbbed54f4136ffbd627ff 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -15,7 +15,9 @@ limitations under the License. */ #pragma once #include +#include #include +#include #include #include "paddle/fluid/framework/garbage_collector.h" #include "paddle/fluid/framework/op_info.h" @@ -30,7 +32,8 @@ namespace framework { struct ExecutorPrepareContext { ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id, const std::vector& skip_ref_cnt_vars = - std::vector()); + std::vector(), + bool force_disable_gc = false); ~ExecutorPrepareContext(); @@ -38,6 +41,7 @@ struct ExecutorPrepareContext { const framework::ProgramDesc& prog_; size_t block_id_; + bool force_disable_gc_; std::vector> ops_; std::unordered_map global_ref_cnts_; @@ -66,7 +70,10 @@ class Executor { * Scope */ void Run(const ProgramDesc& prog, Scope* scope, int block_id, - bool create_local_scope = true, bool create_vars = true); + bool create_local_scope = true, bool create_vars = true, + const std::vector& skip_ref_cnt_vars = + std::vector(), + bool force_disable_gc = false); // This API is very slow. void Run(const ProgramDesc& program, Scope* scope, @@ -79,12 +86,14 @@ class Executor { static std::unique_ptr Prepare( const ProgramDesc& program, int block_id, const std::vector& skip_ref_cnt_vars = - std::vector()); + std::vector(), + bool force_disable_gc = false); static std::vector> Prepare( const ProgramDesc& program, const std::vector& block_ids, const std::vector>& skip_ref_cnt_vars = - std::vector>()); + std::vector>(), + bool force_disable_gc = false); void CreateVariables(const ProgramDesc& pdesc, Scope* scope, int block_id); diff --git a/paddle/fluid/framework/inplace_op_inference_test.cc b/paddle/fluid/framework/inplace_op_inference_test.cc index 3e4d715c6f089496d1b1f7906e3f10147a073622..bf9d1dcd380cdff886301faf13b0015fd5a2ed5c 100644 --- a/paddle/fluid/framework/inplace_op_inference_test.cc +++ b/paddle/fluid/framework/inplace_op_inference_test.cc @@ -179,11 +179,11 @@ TEST(InferInplace, SingleOpInplaceInToOut) { op->SetOutput("Out", {"test2_out"}); prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64}); + prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128}); prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR); prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR); prog.MutableBlock(0)->Var("test2_out"); - prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 128, 128}); auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; auto in_to_outs = infer_inplace(*op, op->Block()); @@ -201,11 +201,11 @@ TEST(InferInplace, SingleGradOpInplaceInToOut) { op->SetOutput(GradVarName("X"), {"test2_a", "test2_b", "test2_c"}); prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16, 1024, 1024}); prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR); prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR); prog.MutableBlock(0)->Var("test2_out"); - prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 1024, 1024}); auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; auto in_to_outs = infer_inplace(*op, op->Block()); @@ -233,12 +233,12 @@ TEST(InferInplace, MultiOutInplaceInToOut) { prog.MutableBlock(0)->Var("o0"); prog.MutableBlock(0)->Var("y0"); prog.MutableBlock(0)->Var("z0"); - prog.MutableBlock(0)->Var("a0")->SetShape({32, 16}); - prog.MutableBlock(0)->Var("b0")->SetShape({32, 16}); - prog.MutableBlock(0)->Var("c0")->SetShape({32, 16}); - prog.MutableBlock(0)->Var("o0")->SetShape({32, 16}); - prog.MutableBlock(0)->Var("y0")->SetShape({32, 16}); - prog.MutableBlock(0)->Var("z0")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024}); auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; auto in_to_outs = infer_inplace(*op, op->Block()); @@ -267,12 +267,12 @@ TEST(InferInplace, MultiGradInplaceInToOut) { prog.MutableBlock(0)->Var("o0"); prog.MutableBlock(0)->Var("y0"); prog.MutableBlock(0)->Var("z0"); - prog.MutableBlock(0)->Var("a0")->SetShape({32, 16}); - prog.MutableBlock(0)->Var("b0")->SetShape({32, 16}); - prog.MutableBlock(0)->Var("c0")->SetShape({32, 16}); - prog.MutableBlock(0)->Var("o0")->SetShape({32, 16}); - prog.MutableBlock(0)->Var("y0")->SetShape({32, 16}); - prog.MutableBlock(0)->Var("z0")->SetShape({32, 16}); + prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024}); auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; auto in_to_outs = infer_inplace(*op, op->Block()); diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 07c2c970d4de3cecf03e4cf80e60e81e7a9595a8..ca6b0229e906c0f8bfbf9ee6781013cb4ef7bbce 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -102,6 +102,8 @@ cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DE cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass) if (WITH_MKLDNN) cc_test(test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass) + cc_test(test_conv_bias_mkldnn_fuse_pass SRCS mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc DEPS conv_bias_mkldnn_fuse_pass naive_executor) cc_test(test_conv_relu_mkldnn_fuse_pass SRCS mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass) cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass) + cc_test(test_mkldnn_placement_pass SRCS mkldnn/mkldnn_placement_pass_tester.cc DEPS mkldnn_placement_pass) endif () diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h index a756dfc1b98e1de55c809c73e2c4df1e628950ae..39b0585d3a6f9b52c9ec4b0a24f8532a3410851a 100644 --- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h +++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h @@ -22,7 +22,8 @@ namespace ir { class AttentionLSTMFusePass : public FusePassBase { protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h index ad966e11e6222a4ed4c730089c454b0d1c7bd0b3..8c3c8b56c08cc09e66b20d17bf730edec0499f35 100644 --- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h +++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h @@ -31,7 +31,8 @@ class ConvAffineChannelFusePass : public FusePassBase { virtual ~ConvAffineChannelFusePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; const std::string name_scope_{"conv_affine_channel_fuse"}; }; @@ -40,7 +41,8 @@ class ConvEltwiseAddAffineChannelFusePass : public FusePassBase { virtual ~ConvEltwiseAddAffineChannelFusePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; const std::string name_scope_{"conv_eltwiseadd_affine_channel_fuse"}; }; diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc index 846a14e365e6bd7f056d409130a3b246371931da..04765dd1440331fb37ed2eb05a9ce762eb2b81bc 100644 --- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc @@ -169,7 +169,7 @@ std::unique_ptr ConvBNFusePass::ApplyImpl( if (has_bias && conv->Op()->Input("Bias").size() > 0) { // reuse existing conv bias node auto conv_bias_names = conv->Op()->Input("Bias"); - PADDLE_ENFORCE_EQ(conv_bias_names.size(), 1); + PADDLE_ENFORCE_EQ(conv_bias_names.size(), 1UL); auto* conv_bias_var = scope->FindVar(conv_bias_names[0]); auto* conv_bias_tensor = conv_bias_var->GetMutable(); PADDLE_ENFORCE_EQ(conv_bias_tensor->dims(), diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.h b/paddle/fluid/framework/ir/conv_bn_fuse_pass.h index 2c9eb574fe8e054e0ae221f08f664b91f05d95c9..cf425a2730904d4ab21c33e66b72db0692cb087c 100644 --- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.h +++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.h @@ -31,7 +31,8 @@ class ConvBNFusePass : public FusePassBase { virtual ~ConvBNFusePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; const std::string name_scope_{"conv_bn_fuse"}; }; @@ -40,7 +41,8 @@ class ConvEltwiseAddBNFusePass : public FusePassBase { virtual ~ConvEltwiseAddBNFusePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; const std::string name_scope_{"conv_eltwiseadd_bn_fuse"}; }; diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h index 3b40a5a92665c07bc2b66e6a96721f573d40393f..9259a4ac5c89b1a7d1413fb2eaaa5fc6a70348f2 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h @@ -25,7 +25,8 @@ class ConvElementwiseAdd2ActFusePass : public FusePassBase { virtual ~ConvElementwiseAdd2ActFusePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h index ac69aa6458fc8c19b670dea2af1251c44dc353a8..9c0b50f155821cf2bd815a6fb087e3f6cc513641 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h +++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h @@ -25,7 +25,8 @@ class ConvElementwiseAddActFusePass : public FusePassBase { virtual ~ConvElementwiseAddActFusePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h index f234603f5856a9238164f7fb0e5cc81ea9b7ed60..bf43bd5ce2602a3e240c56f00f66f13b79151002 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h +++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h @@ -25,7 +25,8 @@ class ConvElementwiseAddFusePass : public FusePassBase { virtual ~ConvElementwiseAddFusePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h index e5ad3067ec4060e41f1464395f3fc76183de3e66..fde2a0a4eecdec9ad5ac58ad8e63c26cce482682 100644 --- a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h +++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h @@ -14,6 +14,8 @@ #pragma once +#include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" @@ -30,7 +32,8 @@ class EmbeddingFCLSTMFusePass : public FusePassBase { virtual ~EmbeddingFCLSTMFusePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; const std::string name_scope_{"embedding_fc_lstm_fuse"}; }; diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.h b/paddle/fluid/framework/ir/fc_fuse_pass.h index 6c69539d1e48268afc2435f8f73b3818d13107cd..783a052edcf84c8c437a7b2e25f0d67c0366691e 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.h +++ b/paddle/fluid/framework/ir/fc_fuse_pass.h @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#pragma once + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" @@ -29,7 +31,8 @@ class FCFusePass : public FusePassBase { virtual ~FCFusePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.h b/paddle/fluid/framework/ir/fc_gru_fuse_pass.h index 63e1c72bfb2e2641ae5d44858b342d5e427e9045..e359a3289440fffbec622488ecf3a7f49e986574 100644 --- a/paddle/fluid/framework/ir/fc_gru_fuse_pass.h +++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.h @@ -30,7 +30,8 @@ class FCGRUFusePass : public FusePassBase { virtual ~FCGRUFusePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; const std::string name_scope_{"fc_gru_fuse"}; }; @@ -41,7 +42,8 @@ class MulGRUFusePass : public FusePassBase { virtual ~MulGRUFusePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; const std::string name_scope_{"fc_nobias_gru_fuse"}; }; diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h index 3ee32c63a46fcc34bdccd1e14d4bbaf9668c49e9..21482615a6efef930b7328594477a51f4aaf28e7 100644 --- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h @@ -14,6 +14,8 @@ #pragma once +#include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" @@ -30,7 +32,8 @@ class FCLstmFusePass : public FusePassBase { virtual ~FCLstmFusePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; const std::string name_scope_{"fc_lstm_fuse"}; }; @@ -40,7 +43,8 @@ class MulLstmFusePass : public FusePassBase { virtual ~MulLstmFusePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; const std::string name_scope_{"fc_nobias_lstm_fuse"}; }; diff --git a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h index b2fecc076efca333539fe81e67eee222873aee2a..0fee5274478e8b8db852774077ff5979f0aaba25 100644 --- a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h +++ b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h @@ -32,7 +32,8 @@ class FuseElewiseAddActPass : public FusePassBase { virtual ~FuseElewiseAddActPass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; std::unique_ptr FuseElewiseAddAct( std::unique_ptr graph, diff --git a/paddle/fluid/framework/ir/fuse_pass_base.h b/paddle/fluid/framework/ir/fuse_pass_base.h index c53b2a6186741d86f14faf1d21fa19aa09cec036..3a1022bbcbd671391fb034bdff7c3cf97952f84d 100644 --- a/paddle/fluid/framework/ir/fuse_pass_base.h +++ b/paddle/fluid/framework/ir/fuse_pass_base.h @@ -14,6 +14,7 @@ #pragma once +#include #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/scope.h" @@ -24,6 +25,10 @@ namespace ir { static const char kParamScopeAttr[] = "__param_scope__"; static const char kFuseStatisAttr[] = "__fuse_statis__"; +// When we use trt or other third_party lib, the parameters are managed by +// the lib, but not the fluid. So we need to record them to avoid duplicate +// allocation. +static const char kRepetitiveParamAttr[] = "__repetitive_param__"; enum FuseOptions { DO_NOT_FUSE, // fusing will not be done diff --git a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc index 0d94008ea82d0e09732d4b6448fdded94b60733c..fe844caed2e757fb080dcee398c8903b929b06e5 100644 --- a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc +++ b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc @@ -111,7 +111,7 @@ std::unique_ptr FuseReluDepthwiseConvPass::FuseReluDepthwiseConv( xg_var = subgraph.at(xg)->Var(); } - PADDLE_ENFORCE_EQ(layer_op->Input("Input").size(), 1); + PADDLE_ENFORCE_EQ(layer_op->Input("Input").size(), 1UL); PADDLE_ENFORCE_EQ(layer_op->Input("Input")[0], y_var->Name()); layer_op->SetInput("Input", {x_var->Name()}); subgraph.at(layer)->inputs.push_back(subgraph.at(x)); @@ -119,13 +119,13 @@ std::unique_ptr FuseReluDepthwiseConvPass::FuseReluDepthwiseConv( VLOG(4) << "replace " << y_var->Name() << " -> " << x_var->Name(); if (!only_forward) { - PADDLE_ENFORCE_EQ(layer_g_op->Input("Input").size(), 1); + PADDLE_ENFORCE_EQ(layer_g_op->Input("Input").size(), 1UL); PADDLE_ENFORCE_EQ(layer_g_op->Input("Input")[0], y_var->Name()); layer_g_op->SetInput("Input", {x_var->Name()}); subgraph.at(layer_g)->inputs.push_back(subgraph.at(x)); subgraph.at(x)->outputs.push_back(subgraph.at(layer_g)); - PADDLE_ENFORCE_EQ(layer_g_op->Output(GradVarName("Input")).size(), 1); + PADDLE_ENFORCE_EQ(layer_g_op->Output(GradVarName("Input")).size(), 1UL); PADDLE_ENFORCE_EQ(layer_g_op->Output(GradVarName("Input"))[0], yg_var->Name()); layer_g_op->SetOutput(GradVarName("Input"), {xg_var->Name()}); diff --git a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h index 6bd653775e42c1ee16051e205e9fa9888ea05eaa..efb49b8300e677f17d9e205800d837b88edfd2e9 100644 --- a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h +++ b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h @@ -32,7 +32,8 @@ class FuseReluDepthwiseConvPass : public FusePassBase { virtual ~FuseReluDepthwiseConvPass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; std::unique_ptr FuseReluDepthwiseConv( std::unique_ptr graph, bool only_forward) const; }; diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 4b5c846f3271b2dd5e094020571069aff590cd2b..6a9340b870df324f7dea03181bdb2b097e13e705 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include -#include +#include #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/op_proto_maker.h" @@ -76,6 +76,9 @@ std::map> Graph::InitFromProgram( var->inputs.push_back(node); } } + Set>( + details::kStaleProgramOpDescs, + new std::vector(program.Block(0).AllOps())); return var_nodes; } @@ -149,6 +152,39 @@ void Graph::ResolveHazard( } } +std::shared_ptr Graph::Clone() { + auto cloned_graph = std::make_shared(this->program_); + cloned_graph->ReleaseNodes(); + cloned_graph->num_node_created_ = 0; + std::unordered_map origin_to_cloned; + for (auto *n : this->node_set_) { + ir::Node *cloned_node = nullptr; + if (n->IsCtrlVar()) { + cloned_node = cloned_graph->CreateControlDepVar(); + } else if (!n->var_desc_ && !n->op_desc_) { // empty node + cloned_node = cloned_graph->CreateEmptyNode(n->Name(), n->NodeType()); + } else if (n->IsVar()) { + cloned_node = cloned_graph->CreateVarNode(n->Var()); + } else if (n->IsOp()) { + cloned_node = cloned_graph->CreateOpNode(n->Op()); + } + if (cloned_node) { + origin_to_cloned[n] = cloned_node; + } else { + PADDLE_THROW("The cloned node's type is not supported!"); + } + } + for (auto *n : this->node_set_) { + for (auto it = n->inputs.begin(); it != n->inputs.end(); it++) { + origin_to_cloned[n]->inputs.push_back(origin_to_cloned[*it]); + } + for (auto it = n->outputs.begin(); it != n->outputs.end(); it++) { + origin_to_cloned[n]->outputs.push_back(origin_to_cloned[*it]); + } + } + return cloned_graph; +} + bool IsControlDepVar(const ir::Node &var) { return var.Name().find(ir::Node::kControlDepVarName) != std::string::npos; } diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index feb3330176490e71c51cce826fe49d5499469ad7..fff015d4a6f0c631017458ceb039ae3f1deb0e2c 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include +#include #include #include "paddle/fluid/framework/ir/node.h" @@ -26,6 +27,14 @@ limitations under the License. */ namespace paddle { namespace framework { + +namespace details { + +// This attr is not recommended, because the graph should not dependence +// the program once it is built. +constexpr char kStaleProgramOpDescs[] = "stale_program_op_descs"; +} // namespace details + namespace ir { /* @@ -168,10 +177,13 @@ class Graph { return ret; } - void RemoveNode(ir::Node *node) { + std::unique_ptr RemoveNode(ir::Node *node) { PADDLE_ENFORCE(node_set_.find(node) != node_set_.end()); - node_set_.erase(node); + std::unique_ptr ret; + ret.reset(nodes_.at(node).release()); nodes_.erase(node); + node_set_.erase(node); + return ret; } // NOTE low performance, but simple and secure. @@ -184,12 +196,16 @@ class Graph { return nullptr; } - void ResolveHazard( - const std::map> &var_nodes); - - private: - std::map> InitFromProgram( - const ProgramDesc &program); + // Returns reference to the original program. + // WARN: After a series of passes, the current graph can be quite + // different from OriginProgram. Caller shouldn't assume much from + // the returned OriginProgram. + const ProgramDesc &OriginProgram() const { + LOG(WARNING) << "WARN: After a series of passes, the current graph can be " + "quite different from OriginProgram. So, please avoid " + "using the `OriginProgram()` method!"; + return program_; + } // This method takes ownership of `node`. ir::Node *AddNode(ir::Node *node) { @@ -199,6 +215,17 @@ class Graph { return node; } + void ResolveHazard( + const std::map> &var_nodes); + + // Create a new and duplicated graph. + // WARN: The method only clones the graph structure, not its attributes. + std::shared_ptr Clone(); + + private: + std::map> InitFromProgram( + const ProgramDesc &program); + // NOTE: program_ shouldn't be exposed to user. const ProgramDesc program_; std::map attrs_; diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index 22d4c0a91cc1638264a8c57aa2841ff4e65a1400..28a37f331c100695f0ffec7288db84f4493d68a0 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -130,15 +130,21 @@ std::map> BuildOperationAdjList( if (adj_list.find(n) == adj_list.end()) { adj_list[n] = std::unordered_set(); } + std::vector nodes; for (auto &var : n->inputs) { for (auto &adj_n : var->inputs) { PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation); VLOG(4) << "adj " << adj_n->Name() << reinterpret_cast(adj_n) << " -> " << n->Name() << reinterpret_cast(n) << " via " << var->Name() << reinterpret_cast(var); - adj_list[n].insert(adj_n); + nodes.push_back(adj_n); } } + std::sort(nodes.begin(), nodes.end(), [](ir::Node *node1, ir::Node *node2) { + return node1->id() > node2->id(); + }); + adj_list[n].insert(std::make_move_iterator(nodes.begin()), + std::make_move_iterator(nodes.end())); } return adj_list; } diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 9ea0729e1f3339c2f17371ecc8fa51325b9629bb..c0c34d186b00814fe6c6fd42beb78133233a1357 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -38,7 +38,7 @@ size_t PDPattern::id_ = 0UL; PDNode *PDPattern::NewNode(const std::string &name) { if (!name.empty()) { - PADDLE_ENFORCE_EQ(node_map_.count(name), 0, + PADDLE_ENFORCE_EQ(node_map_.count(name), 0UL, "PDNode's name should be unique, get duplicate [%s]", name); } @@ -51,7 +51,7 @@ PDNode *PDPattern::NewNode(const std::string &name) { PDNode *PDPattern::NewNode(PDNode::teller_t &&teller, const std::string &name) { if (!name.empty()) { - PADDLE_ENFORCE_EQ(node_map_.count(name), 0, + PADDLE_ENFORCE_EQ(node_map_.count(name), 0UL, "PDNode's name should be unique, get duplicate [%s]", name); } diff --git a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc index 3b738aa159ebfd77f00c9e532fbd94542e2097db..5bdc0c5faed7131b873edf9b43c847c010b6e3f3 100644 --- a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc +++ b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc @@ -38,9 +38,13 @@ std::unique_ptr IdentityScaleOpCleanPass::ApplyImpl( ->assert_is_op("scale") ->assert_op_attr("scale", 1.) ->assert_op_attr("bias", 0.); - auto scale_out = detector.mutable_pattern() - ->NewNode("scale_out") - ->assert_is_op_output("scale"); + auto scale_out = + detector.mutable_pattern() + ->NewNode("scale_out") + ->assert_is_op_output("scale") + // scale's output var should has only one consumer, or it can't be + // removed. + ->assert_more([](Node* x) { return x->outputs.size() == 1UL; }); pre_op->LinksTo({scale_in}); scale_op->LinksFrom({scale_in}).LinksTo({scale_out}); diff --git a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h index 50a654d82f0e4fb7e8e91c665397716407e6d2a5..6da592561da1e4046acbfd86c04862f69b7a97a8 100644 --- a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h +++ b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h @@ -22,7 +22,8 @@ namespace ir { class IdentityScaleOpCleanPass : public FusePassBase { protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; private: virtual ~IdentityScaleOpCleanPass() = default; diff --git a/paddle/fluid/framework/ir/lock_free_optimize_pass.h b/paddle/fluid/framework/ir/lock_free_optimize_pass.h index 7310f596f8a3170e84840be4bab8390b780b6577..f9157b10d9554092a5da6a6f73ecf7ceac1430dd 100644 --- a/paddle/fluid/framework/ir/lock_free_optimize_pass.h +++ b/paddle/fluid/framework/ir/lock_free_optimize_pass.h @@ -60,7 +60,8 @@ class LockFreeOptimizePass : public Pass { virtual ~LockFreeOptimizePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; private: // Create a new sgd node via current optimizer node diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h index f3ad9f1c2bf14db418629e0c607e2510f01908b8..0ef5c177bf98b354bb18fc1d2ec8e5bef4b58951 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h @@ -29,7 +29,8 @@ class ConvBiasFusePass : public FusePassBase { virtual bool is_conv3d() const { return false; } protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; const std::string name_scope_{"conv_bias_mkldnn_fuse"}; }; /* diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..38b7fe52037c1a264e4251b7a54ef7569ee6d765 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc @@ -0,0 +1,151 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h" +#include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/platform/place.h" + +#include +#include "paddle/fluid/framework/op_proto_maker.h" + +namespace paddle { +namespace framework { +namespace ir { + +void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, + const std::vector& inputs, + const std::vector& outputs) { + auto* op = prog->MutableBlock(0)->AppendOp(); + op->SetType(type); + if (type == "conv2d") { + op->SetAttr("use_mkldnn", true); + op->SetAttr("name", name); + op->SetInput("Input", {inputs[0]}); + op->SetInput("Filter", {inputs[1]}); + if (inputs.size() > 2) + op->SetInput("Bias", {inputs[2]}); + else + op->SetInput("Bias", {}); + } else if (type == "elementwise_add") { + op->SetAttr("use_mkldnn", true); + op->SetInput("X", {inputs[0]}); + op->SetInput("Y", {inputs[1]}); + } + op->SetOutput("Out", outputs); + op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), + static_cast(OpRole::kForward)); +} + +// (c, weights)->conv->f +// (f)->elementwise_add->g +ProgramDesc BuildProgramDesc(bool convWithExistingBias) { + ProgramDesc prog; + std::vector nodes{"c", "weights", "f", "eltwise_bias", "g"}; + if (convWithExistingBias) nodes.push_back("conv_bias"); + for (auto& v : nodes) { + auto* var = prog.MutableBlock(0)->Var(v); + var->SetType(proto::VarType::LOD_TENSOR); + if (v == "weights" || v == "conv_bias" || v == "eltwise_bias") { + var->SetPersistable(true); + } + } + + // conv+bias, both with MKL-DNN + if (convWithExistingBias) { + SetOp(&prog, "conv2d", "conv", + std::vector({"c", "weights", "conv_bias"}), + std::vector({"f"})); + } else { + SetOp(&prog, "conv2d", "conv", std::vector({"c", "weights"}), + std::vector({"f"})); + } + SetOp(&prog, "elementwise_add", "eltwise", + std::vector({"f", "eltwise_bias"}), + std::vector({"g"})); + + return prog; +} + +void InitTensorHolder(Scope* scope, const paddle::platform::Place& place, + const char* var_name) { + auto x = scope->Var(var_name); + auto tensor = x->GetMutable(); + tensor->mutable_data(place, proto::VarType::FP32, + ::paddle::memory::Allocator::kDefault, 1); +} + +void MainTest(bool convWithExistingBias) { + auto prog = BuildProgramDesc(convWithExistingBias); + std::unique_ptr graph(new ir::Graph(prog)); + auto place = paddle::platform::CPUPlace(); + NaiveExecutor exe{place}; + Scope scope; + // Init scope, as it is used in pass + exe.CreateVariables(prog, 0, true, &scope); + if (convWithExistingBias) { + InitTensorHolder(&scope, place, "conv_bias"); + InitTensorHolder(&scope, place, "eltwise_bias"); + } + graph->Set(kParamScopeAttr, new framework::Scope*(&scope)); + + auto pass = PassRegistry::Instance().Get("conv_bias_mkldnn_fuse_pass"); + + int original_nodes_num = graph->Nodes().size(); + + graph = pass->Apply(std::move(graph)); + + int current_nodes_num = graph->Nodes().size(); + + // Remove 3 Nodes: Conv, Bias, conv_out + // Add 1 Node: ConvBias + EXPECT_EQ(original_nodes_num - 2, current_nodes_num); + + // Assert conv_bias op in newly generated graph + int conv_bias_count = 0; + + for (auto* node : graph->Nodes()) { + if (node->IsOp() && node->Op()->Type() == "conv2d") { + auto* op = node->Op(); + ASSERT_TRUE(op->HasAttr("use_mkldnn")); + EXPECT_TRUE(boost::get(op->GetAttr("use_mkldnn"))); + // check if "conv" convolution is fused + auto op_name = boost::get(op->GetAttr("name")); + if (op_name == "conv") { + auto input_names = op->InputNames(); + ASSERT_TRUE(std::find(input_names.begin(), input_names.end(), "Bias") != + input_names.end()); + auto bias = boost::get>(op->Input("Bias")); + if (bias.size()) { + ++conv_bias_count; + } + } + } + } + EXPECT_EQ(conv_bias_count, 1); +} + +TEST(ConvBiasFusePass, bias_free_conv) { MainTest(false); } + +TEST(ConvBiasFusePass, conv_with_existing_bias) { MainTest(true); } + +TEST(ConvBiasFusePass, conv3d) { + Conv3DBiasFusePass pass; + ASSERT_TRUE(pass.is_conv3d()); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(conv_bias_mkldnn_fuse_pass); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc index 9ef5c298b8cddfec094e9544dc6da9afdcaf0dab..433d89d8d3f20b3f87cd94901ebbf79cd99de813 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc @@ -44,10 +44,14 @@ struct TestIsReachable { using func = std::function; auto operator()(const std::unique_ptr& graph) -> func { - auto find_node = [](const std::unique_ptr& graph, - const std::string& name) -> Node* { + auto hash = [](const Node* node) -> std::string { + return node->Name() + std::to_string(node->id()); + }; + + auto find_node = [&](const std::unique_ptr& graph, + const std::string& name) -> Node* { for (auto& node : GraphTraits::DFS(*graph)) { - if (name == node.Name()) { + if (name == hash(&node)) { return &node; } } @@ -55,13 +59,17 @@ struct TestIsReachable { return nullptr; }; - return [&](std::string from, const std::string to) -> bool { + // update the from and to strings to hashed equivs in loop from graph traits + return [&](std::string from, std::string to) -> bool { if (from == to) return true; std::map visited; for (auto& node : GraphTraits::DFS(*graph)) { - visited[node.Name()] = false; + auto hashed = hash(&node); + if (node.Name() == from) from = hashed; + if (node.Name() == to) to = hashed; + visited[hashed] = false; } visited[from] = true; @@ -72,15 +80,15 @@ struct TestIsReachable { while (!queue.empty()) { auto cur = find_node(graph, queue.front()); queue.pop_front(); - if (cur == nullptr) return false; for (auto n : cur->outputs) { - if (n->Name() == to) return true; + auto hashed_name = hash(n); + if (hashed_name == to) return true; - if (!visited[n->Name()]) { - visited[n->Name()] = true; - queue.push_back(n->Name()); + if (!visited[hashed_name]) { + visited[hashed_name] = true; + queue.push_back(hashed_name); } } } @@ -166,6 +174,28 @@ TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionAsYWithElementwiseAddRelu) { RunPassAndAssert(&prog, "a", "relu", 1); } +TEST(ConvElementwiseAddMKLDNNFusePass, + ConvolutionProjectionAsYWithElementwiseAddRelu) { + auto prog = BuildProgramDesc({"a", "b", "c", "d", "e", "f"}, + {"bias", "weights", "bias2", "weights2"}); + + SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"}); + // right branch + SetOp(&prog, "conv2d", + {{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}}, + {"Output", "c"}); + + // left branch + SetOp(&prog, "conv2d", + {{"Input", "a"}, {"Bias", "bias2"}, {"Filter", "weights2"}}, + {"Output", "f"}); + + SetOp(&prog, "elementwise_add", {{"X", "f"}, {"Y", "c"}}, {"Out", "d"}); + SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"}); + + RunPassAndAssert(&prog, "a", "relu", 2); +} + TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionAsYWithElementwiseAddReluNoBias) { auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"weights"}); diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc index 20e52410ffe3caa86450bc05bf3aabf5a5bce374..ccac65f3b3ad22d0f424ef9de9a7bd506e8ac862 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc @@ -21,7 +21,7 @@ namespace ir { std::unique_ptr MKLDNNPlacementPass::ApplyImpl( std::unique_ptr graph) const { - VLOG(3) << "Aplies MKL-DNN placement strategy."; + VLOG(3) << "Applies MKL-DNN placement strategy."; const auto& op_types_list = Get>("mkldnn_enabled_op_types"); for (const Node* n : graph->Nodes()) { diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..b6ec7e4d68b95125d630ce4a60635eb7b711e820 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc @@ -0,0 +1,136 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h" + +#include +#include + +namespace paddle { +namespace framework { +namespace ir { + +void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, + const std::vector& inputs, + const std::vector& outputs, boost::tribool use_mkldnn) { + auto* op = prog->MutableBlock(0)->AppendOp(); + + op->SetType(type); + + if (!boost::indeterminate(use_mkldnn)) op->SetAttr("use_mkldnn", use_mkldnn); + + if (type == "conv2d") { + op->SetAttr("name", name); + op->SetInput("Input", {inputs[0]}); + op->SetInput("Filter", {inputs[1]}); + op->SetInput("Bias", {inputs[2]}); + } else if (type == "relu") { + op->SetInput("X", inputs); + } else if (type == "concat") { + op->SetAttr("axis", 1); + op->SetInput("X", {inputs[0], inputs[1]}); + } else if (type == "pool2d") { + op->SetInput("X", {inputs[0]}); + } else { + FAIL() << "Unexpected operator type."; + } + op->SetOutput("Out", {outputs[0]}); +} + +// operator use_mkldnn +// --------------------------------------- +// (a,b)->concat->c none +// (c,weights,bias)->conv->f none +// f->relu->g false +// g->pool->h false +// (h,weights2,bias2)->conv->k true +// k->relu->l true +ProgramDesc BuildProgramDesc() { + ProgramDesc prog; + + for (auto& v : + std::vector({"a", "b", "c", "weights", "bias", "f", "g", + "h", "weights2", "bias2", "k", "l"})) { + auto* var = prog.MutableBlock(0)->Var(v); + var->SetType(proto::VarType::SELECTED_ROWS); + if (v == "weights" || v == "bias") { + var->SetPersistable(true); + } + } + + SetOp(&prog, "concat", "concat1", std::vector({"a", "b"}), + std::vector({"c"}), boost::indeterminate); + SetOp(&prog, "conv2d", "conv1", + std::vector({"c", "weights", "bias"}), + std::vector({"f"}), boost::indeterminate); + SetOp(&prog, "relu", "relu1", std::vector({"f"}), + std::vector({"g"}), false); + SetOp(&prog, "pool2d", "pool1", std::vector({"g"}), + std::vector({"h"}), false); + SetOp(&prog, "conv2d", "conv2", + std::vector({"h", "weights2", "bias2"}), + std::vector({"k"}), true); + SetOp(&prog, "relu", "relu2", std::vector({"k"}), + std::vector({"l"}), true); + + return prog; +} + +void MainTest(std::initializer_list mkldnn_enabled_op_types, + unsigned expected_use_mkldnn_true_count) { + auto prog = BuildProgramDesc(); + + std::unique_ptr graph(new ir::Graph(prog)); + + auto pass = PassRegistry::Instance().Get("mkldnn_placement_pass"); + pass->Set("mkldnn_enabled_op_types", + new std::unordered_set(mkldnn_enabled_op_types)); + + graph = pass->Apply(std::move(graph)); + + unsigned use_mkldnn_true_count = 0; + + for (auto* node : graph->Nodes()) { + if (node->IsOp()) { + auto* op = node->Op(); + if (op->HasAttr("use_mkldnn") && + boost::get(op->GetAttr("use_mkldnn"))) { + ++use_mkldnn_true_count; + } + } + } + + EXPECT_EQ(use_mkldnn_true_count, expected_use_mkldnn_true_count); +} + +TEST(MKLDNNPlacementPass, enable_conv_relu) { + // 1 conv (1 conv is always true) + 2 relu (1 relu is always true) + 0 pool + MainTest({"conv2d", "relu"}, 3); +} + +TEST(MKLDNNPlacementPass, enable_relu_pool) { + // 1 conv (1 conv is always true) + 2 relu (1 relu is always true) + 1 pool + MainTest({"relu", "pool2d"}, 4); +} + +TEST(MKLDNNPlacementPass, enable_all) { + // 1 conv (1 conv is always true) + 2 relu (1 relu is always true) + 1 pool + MainTest({}, 4); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(mkldnn_placement_pass); diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index 9eade9eaa8f00fe6e76063344f47968f4e97cf7f..72fb876d98dc84164398583baf22c49014af483a 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include #include #include diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h index 3f3f0846eba1201e57a653f8e515c28d2bcdd5e3..ede0bea07ff4130a0f6b3d21d6e34222a5013170 100644 --- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h +++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h @@ -31,7 +31,8 @@ class RepeatedFCReluFusePass : public FusePassBase { virtual ~RepeatedFCReluFusePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; const std::string name_scope_{"repeated_fc_relu_fuse"}; }; diff --git a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h index 9f5fd1a29adf918806d8f30097d8c7f002f48f3e..06e18f9dc327bf2ffaf8d2ab64edcbddea2eb04c 100644 --- a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h +++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#pragma once + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/pass.h" @@ -25,7 +27,8 @@ class SeqConcatFcFusePass : public FusePassBase { virtual ~SeqConcatFcFusePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h index dac9de71930c1768bdf416520caae6468449cd3d..c36c6b76a238dd21eb0c9308e780761aa9e4e27a 100644 --- a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h +++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h @@ -28,7 +28,8 @@ class SeqConvEltAddReluFusePass : public FusePassBase { virtual ~SeqConvEltAddReluFusePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; const std::string name_scope_{"seqconv_eltadd_relu_fuse"}; }; diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h index ba2154045e62c687173565c5ad30ea4d45d3c8f4..a5db3528da36ad08bb7f4d2765ee78222c569a5c 100644 --- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h +++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h @@ -42,7 +42,8 @@ class SeqPoolConcatFusePass : public FusePassBase { virtual ~SeqPoolConcatFusePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; const std::string name_scope_{"seqpool_concat_fuse"}; }; diff --git a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h index fb49adc3768ec99cab4321c6b90c93dfed6d32f2..c21ba65c40a8d54c315ab347e5a8a3266a143779 100644 --- a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h +++ b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h @@ -31,7 +31,8 @@ class SquaredMatSubFusePass : public FusePassBase { virtual ~SquaredMatSubFusePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; const std::string name_scope_{"squared_mat_sub_fuse"}; }; diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h index fb0f0ae9efdc5a25a799d6123fa658a99860cd86..a7d18ec86da1c02aef84c25c378691eb8f651015 100644 --- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h +++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h @@ -30,7 +30,8 @@ class TransposeFlattenConcatFusePass : public FusePassBase { virtual ~TransposeFlattenConcatFusePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h index 0a0f8f4655bc34cdb25205ff6eaec9f96c801ebd..5f3ce60e1d94ea4078cf0b709df362bad317f621 100644 --- a/paddle/fluid/framework/op_proto_maker.h +++ b/paddle/fluid/framework/op_proto_maker.h @@ -27,7 +27,7 @@ enum class OpRole { kForward = 0x0000, kBackward = 0x0001, kOptimize = 0x0002, - // RPC role is for send/recv releated op + // RPC role is for send/recv related op kRPC = 0x0004, // Dist role is for split_byref/split_selected_rows/concat // used for distributed training. diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 2c1648c81fc999c6306d5b08bc243f3ad21fec04..a53a81c270aeec1b6ee4ed30e77526f4ea2e7977 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -290,7 +290,7 @@ struct OpKernelRegistrarFunctorEx>( OpProtoAndCheckerMaker::OpCreationCallstackAttrName()); if (callstack.empty()) { - throw; + throw std::move(exception); } std::ostringstream sout; sout << "Invoke operator " << Type() << " error.\n"; @@ -206,7 +204,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { sout << "C++ Callstacks: \n"; sout << exception.err_str_; exception.err_str_ = sout.str(); - throw; + throw std::move(exception); } catch (...) { std::rethrow_exception(std::current_exception()); } @@ -469,12 +467,6 @@ const Variable* ExecutionContext::InputVar(const std::string& name) const { return it->second.empty() ? nullptr : it->second[0]; } -const Variable* ExecutionContext::LegacyInputVar( - const std::string& name) const { - auto ipt = op_.Input(name); - return ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt); -} - Variable* ExecutionContext::OutputVar(const std::string& name) const { auto it = ctx_.outputs.find(name); if (it == ctx_.outputs.end()) return nullptr; @@ -485,22 +477,11 @@ Variable* ExecutionContext::OutputVar(const std::string& name) const { return it->second.empty() ? nullptr : it->second[0]; } -Variable* ExecutionContext::LegacyOutputVar(const std::string& name) const { - auto opt = op_.Output(name); - return opt == kEmptyVarName ? nullptr : scope_.FindVar(opt); -} - template <> const Tensor* ExecutionContext::Input(const std::string& name) const { return Input(name); } -template <> -const Tensor* ExecutionContext::LegacyInput( - const std::string& name) const { - return LegacyInput(name); -} - template <> const std::vector ExecutionContext::MultiInput( const std::string& name) const { @@ -523,35 +504,11 @@ const std::vector ExecutionContext::MultiInput( return res; } -template <> -const std::vector ExecutionContext::LegacyMultiInput( - const std::string& name) const { - auto names = op().Inputs(name); - std::vector res; - res.reserve(names.size()); - std::transform(names.begin(), names.end(), std::back_inserter(res), - [&](const std::string& sub_name) -> const Tensor* { - auto var = scope_.FindVar(sub_name); - if (var == nullptr) return nullptr; - PADDLE_ENFORCE( - var->IsType(), - "%s should be LoDTensor, but the received type is %s", - sub_name, ToTypeName(var->Type())); - return &(var->Get()); - }); - return res; -} - template <> Tensor* ExecutionContext::Output(const std::string& name) const { return Output(name); } -template <> -Tensor* ExecutionContext::LegacyOutput(const std::string& name) const { - return LegacyOutput(name); -} - template <> std::vector ExecutionContext::MultiOutput( const std::string& name) const { @@ -884,7 +841,8 @@ class RuntimeInferShapeContext : public InferShapeContext { const RuntimeContext& ctx_; }; -static void CheckTensorNANOrInf(const std::string& name, +static void CheckTensorNANOrInf(const std::string& op_type, + const std::string& name, const framework::Tensor& tensor) { if (tensor.memory_size() == 0) { return; @@ -894,9 +852,9 @@ static void CheckTensorNANOrInf(const std::string& name, return; } PADDLE_ENFORCE(!framework::TensorContainsInf(tensor), - "Tensor %s contains Inf", name); + "Operator %s output Tensor %s contains Inf", op_type, name); PADDLE_ENFORCE(!framework::TensorContainsNAN(tensor), - "Tensor %s contains NAN", name); + "Operator %s output Tensor %s contains NAN", op_type, name); } void OperatorWithKernel::RuntimeInferShape(const Scope& scope, @@ -906,6 +864,16 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope, this->InferShape(&infer_shape_ctx); } +std::vector* OperatorWithKernel::GetKernelConfig( + const OpKernelType& key) const { + auto config_iter = kernel_configs_map_.find(key); + std::vector* kernel_configs = nullptr; + if (config_iter != kernel_configs_map_.end()) { + kernel_configs = &(config_iter->second); + } + return kernel_configs; +} + void OperatorWithKernel::RunImpl(const Scope& scope, const platform::Place& place) const { RuntimeContext ctx(Inputs(), Outputs(), scope); @@ -923,7 +891,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, OpKernelMap& kernels = kernels_iter->second; auto expected_kernel_key = this->GetExpectedKernelType( - ExecutionContext(*this, scope, *dev_ctx, ctx)); + ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr)); VLOG(3) << "expected_kernel_key:" << expected_kernel_key; auto kernel_iter = kernels.find(expected_kernel_key); @@ -942,6 +910,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope, KernelTypeToString(expected_kernel_key)); } + std::vector* kernel_configs = + GetKernelConfig(expected_kernel_key); + // do data transformScope &transfer_scope; std::vector transfered_inplace_vars; auto* transfer_scope = @@ -959,7 +930,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, this->InferShape(&infer_shape_ctx); // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext // not Scope. Imperative mode only pass inputs and get outputs. - kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx, ctx)); + kernel_iter->second( + ExecutionContext(*this, exec_scope, *dev_ctx, ctx, kernel_configs)); if (!transfered_inplace_vars.empty()) { // there is inplace variable has been transfered. @@ -976,9 +948,10 @@ void OperatorWithKernel::RunImpl(const Scope& scope, auto* var = exec_scope.FindVar(vname); if (var == nullptr) continue; if (var->IsType()) { - CheckTensorNANOrInf(vname, var->Get()); + CheckTensorNANOrInf(type_, vname, var->Get()); } else if (var->IsType()) { - CheckTensorNANOrInf(vname, var->Get().value()); + CheckTensorNANOrInf(type_, vname, + var->Get().value()); } } } diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index e33214b44bb5d8ea5eb32d442d597a369c198bdd..55629636a816982c4debe4b5b7138558ac309eb5 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -16,9 +16,11 @@ limitations under the License. */ #include #include +#include #include #include #include +#include #include #include "glog/logging.h" // For VLOG @@ -28,6 +30,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_kernel_type.h" +#include "paddle/fluid/framework/operator_kernel_configs.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/tensor.h" @@ -184,12 +187,30 @@ class OperatorBase { const platform::Place& place) const = 0; }; +#ifdef PADDLE_WITH_CUDA +using KernelConfig = boost::variant< + std::shared_ptr>, + std::shared_ptr>, + std::shared_ptr>>; +#else +using KernelConfig = boost::variant; +#endif + +using OpKernelConfigsMap = + std::unordered_map, + OpKernelType::Hash>; + class ExecutionContext { public: ExecutionContext(const OperatorBase& op, const Scope& scope, const platform::DeviceContext& device_context, - const RuntimeContext& ctx) - : op_(op), scope_(scope), device_context_(device_context), ctx_(ctx) {} + const RuntimeContext& ctx, + std::vector* configs) + : op_(op), + scope_(scope), + device_context_(device_context), + ctx_(ctx), + kernel_configs_(configs) {} const OperatorBase& op() const { return op_; } @@ -234,31 +255,6 @@ class ExecutionContext { return it->second; } - const std::vector LegacyMultiInputVar( - const std::string& name) const { - auto names = op_.Inputs(name); - std::vector res; - res.reserve(names.size()); - std::transform(names.begin(), names.end(), std::back_inserter(res), - [this](const std::string& name) { - return name == kEmptyVarName ? nullptr - : scope_.FindVar(name); - }); - return res; - } - - std::vector LegacyMultiOutputVar(const std::string& name) const { - auto names = op_.Outputs(name); - std::vector res; - res.reserve(names.size()); - std::transform(names.begin(), names.end(), std::back_inserter(res), - [this](const std::string& name) { - return name == kEmptyVarName ? nullptr - : scope_.FindVar(name); - }); - return res; - } - template const T* Input(const std::string& name) const { auto* var = InputVar(name); @@ -271,22 +267,6 @@ class ExecutionContext { return var == nullptr ? nullptr : var->GetMutable(); } - template - const T* LegacyInput(const std::string& name) const { - auto* var = LegacyInputVar(name); - return var == nullptr ? nullptr : &var->Get(); - } - - template - T* LegacyOutput(const std::string& name) const { - auto var = LegacyOutputVar(name); - return var == nullptr ? nullptr : var->GetMutable(); - } - - const Variable* LegacyInputVar(const std::string& name) const; - - Variable* LegacyOutputVar(const std::string& name) const; - template const std::vector MultiInput(const std::string& name) const { auto it = ctx_.inputs.find(name); @@ -319,32 +299,6 @@ class ExecutionContext { return res; } - template - const std::vector LegacyMultiInput(const std::string& name) const { - auto names = op_.Inputs(name); - std::vector res; - res.reserve(names.size()); - std::transform(names.begin(), names.end(), std::back_inserter(res), - [&](const std::string& sub_name) -> const T* { - auto var = scope_.FindVar(sub_name); - return var == nullptr ? nullptr : &var->Get(); - }); - return res; - } - - template - std::vector LegacyMultiOutput(const std::string& name) const { - auto names = op_.Outputs(name); - std::vector res; - res.reserve(names.size()); - std::transform(names.begin(), names.end(), std::back_inserter(res), - [&](const std::string& sub_name) -> T* { - auto var = scope_.FindVar(sub_name); - return var == nullptr ? nullptr : var->GetMutable(); - }); - return res; - } - platform::Place GetPlace() const { return device_context_.GetPlace(); } template @@ -398,34 +352,32 @@ class ExecutionContext { return temp_tensor; } + template + T& GetKernelConfig(int idx) const { + PADDLE_ENFORCE(kernel_configs_ && kernel_configs_->size() > idx, + "%s selected kernel doesn't have kernel config %lu <= %d", + op_.Type().c_str(), kernel_configs_->size(), idx); + return *boost::get>(kernel_configs_->at(idx)); + } + private: const OperatorBase& op_; const Scope& scope_; const platform::DeviceContext& device_context_; const RuntimeContext& ctx_; + mutable std::vector* kernel_configs_; }; template <> const Tensor* ExecutionContext::Input(const std::string& name) const; -template <> -const Tensor* ExecutionContext::LegacyInput( - const std::string& name) const; - template <> const std::vector ExecutionContext::MultiInput( const std::string& name) const; -template <> -const std::vector ExecutionContext::LegacyMultiInput( - const std::string& name) const; - template <> Tensor* ExecutionContext::Output(const std::string& name) const; -template <> -Tensor* ExecutionContext::LegacyOutput(const std::string& name) const; - template <> std::vector ExecutionContext::MultiOutput( const std::string& name) const; @@ -483,6 +435,8 @@ class OperatorWithKernel : public OperatorBase { virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const; + std::vector* GetKernelConfig(const OpKernelType& key) const; + protected: virtual OpKernelType GetKernelTypeForVar( const std::string& var_name, const Tensor& tensor, @@ -508,6 +462,9 @@ class OperatorWithKernel : public OperatorBase { void TransferInplaceVarsBack(const Scope& scope, const std::vector& inplace_vars, const Scope& exec_scope) const; + + protected: + mutable OpKernelConfigsMap kernel_configs_map_; }; extern bool OpSupportGPU(const std::string& op_type); diff --git a/paddle/fluid/framework/operator_kernel_configs.h b/paddle/fluid/framework/operator_kernel_configs.h new file mode 100644 index 0000000000000000000000000000000000000000..c520c222350ceeef246dae756a7157872ae087fa --- /dev/null +++ b/paddle/fluid/framework/operator_kernel_configs.h @@ -0,0 +1,118 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +namespace paddle { +namespace framework { + +// Not thread-safe. Should be owned per-kernel. +template +class AlgorithmsCache { + public: + AlgorithmsCache() : search_times_(0) { hash_.clear(); } + // Caches the best algorithm for a given + // combination of tensor dimensions & compute data type. + TAlgorithm GetAlgorithm( + const std::vector& dims1, const std::vector& dims2, + const std::vector& strides, const std::vector& paddings, + const std::vector& dilations, + int algorithmFlags, // can set for different data type + std::function gen_func); + + TAlgorithm GetAlgorithm(int64_t area, int search_times, int algorithmFlags, + std::function gen_func); + + private: + std::unordered_map hash_; + int search_times_; +}; + +template +TAlgorithm framework::AlgorithmsCache::GetAlgorithm( + const std::vector& dims1, const std::vector& dims2, + const std::vector& strides, const std::vector& paddings, + const std::vector& dilations, int algorithmFlags, + std::function gen_func) { + int64_t seed = 0; + // Hash all of the inputs, use to try and look up a previously + // discovered algorithm, or fall back to generating a new one. + std::hash hashFn; + // do hash like boost + // https://stackoverflow.com/questions/2590677/how-do-i-combine-hash-values-in-c0x + for (const auto num : dims1) { + seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2); + } + + for (const auto num : dims2) { + seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2) + 1; + } + + for (const auto num : strides) { + seed ^= hashFn(static_cast(num)) + 0x9e3779b9 + (seed << 6) + + (seed >> 2) + 2; + } + + for (const auto num : paddings) { + seed ^= hashFn(static_cast(num)) + 0x9e3779b9 + (seed << 6) + + (seed >> 2) + 3; + } + + for (const auto num : dilations) { + seed ^= hashFn(static_cast(num)) + 0x9e3779b9 + (seed << 6) + + (seed >> 2) + 4; + } + + seed ^= hashFn(static_cast(algorithmFlags)) + 0x9e3779b9 + + (seed << 6) + (seed >> 2) + 5; + + if (seed == 0) return gen_func(); + + if (hash_.find(seed) == hash_.end()) { + TAlgorithm value = gen_func(); + hash_[seed] = value; + } + return hash_[seed]; +} + +template +TAlgorithm AlgorithmsCache::GetAlgorithm( + int64_t area, int search_times, int algorithmFlags, + std::function gen_func) { + if (hash_.find(area) != hash_.end()) { + return hash_[area]; + } + if (search_times_ < search_times) { + auto algo = gen_func(); + hash_[area] = algo; + ++search_times_; + return algo; + } + TAlgorithm algo; + int64_t min = static_cast(INT_MAX); + for (const auto& m : hash_) { + if (m.first < min) { + min = m.first; + algo = m.second; + } + } + return algo; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index ff7ef0cce2f12fe89dd087c4a5006b2cfdc5a4a9..acc71396b441149988f654843482a9e292977de3 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/details/all_reduce_deps_pass.h" #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" @@ -180,12 +181,14 @@ std::vector &ParallelExecutor::GetLocalScopes() { return member_->local_scopes_; } -ParallelExecutor::ParallelExecutor( - const std::vector &places, - const std::unordered_set &bcast_vars, - const ProgramDesc &main_program, const std::string &loss_var_name, - Scope *scope, const std::vector &local_scopes, - const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy) +ParallelExecutor::ParallelExecutor(const std::vector &places, + const std::vector &bcast_vars, + const std::string &loss_var_name, + Scope *scope, + const std::vector &local_scopes, + const ExecutionStrategy &exec_strategy, + const BuildStrategy &build_strategy, + ir::Graph *graph) : member_(new ParallelExecutorPrivate(places)) { member_->global_scope_ = scope; member_->use_cuda_ = exec_strategy.use_cuda_; @@ -193,7 +196,6 @@ ParallelExecutor::ParallelExecutor( member_->use_all_reduce_ = build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce; member_->nranks_ = build_strategy.num_trainers_ * places.size(); - if (!member_->use_all_reduce_) { PADDLE_ENFORCE(places.size() > 1, "If you set build_strategy.reduce with 'Reduce'," @@ -216,14 +218,17 @@ ParallelExecutor::ParallelExecutor( } } + std::unique_ptr temp_owned_graph(graph); + // FIXME(Yancey1989): parallel graph mode get better performance // in GPU allreduce distributed training. Need an elegant way to // choice the execution strategy. - build_strategy.enable_parallel_graph_ = - EnableParallelGraphExecution(main_program, exec_strategy, build_strategy); - - VLOG(1) << "Enable ParallelGraph Execution: " - << build_strategy.enable_parallel_graph_; + build_strategy.enable_parallel_graph_ = EnableParallelGraphExecution( + *temp_owned_graph, exec_strategy, build_strategy); + if (build_strategy.enable_parallel_graph_) + VLOG(0) << "The Executor would execute the graph by ParallelGraph " + "Execution which can get better performance," + << "you can force it off by env FLAGS_enable_parallel_graph=0"; if (member_->use_cuda_) { // Bcast Parameters to all GPUs @@ -250,67 +255,71 @@ ParallelExecutor::ParallelExecutor( PADDLE_THROW("Not compiled with CUDA"); #endif } - if (member_->local_scopes_.size() != 1 && local_scopes.empty()) { - BCastParamsToDevices(bcast_vars); + // broadcast parameters from the 0th device to others: + auto need_broadcast = [&]() -> bool { + if (build_strategy.num_trainers_ > 1) { + // 1. num_tariners would be grater than 1 for nccl distributed training. + return true; + } else if (member_->local_scopes_.size() != 1 && local_scopes.empty()) { + // 2. Only one trainer process, but ParallelExecutor hold multiple + // devices. + return true; + } + return false; + }; + + if (need_broadcast()) { + BCastParamsToDevices(bcast_vars, build_strategy.trainer_id_); } - // Startup Program has been run. All local scopes has correct parameters. - // Step 2. Convert main_program to SSA form and dependency graph. Also, insert - // ncclOp - std::vector> graphs; +// Startup Program has been run. All local scopes has correct parameters. + +// Step 2. Convert main_program to SSA form and dependency graph. Also, insert +// ncclOp #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - if (build_strategy.enable_parallel_graph_) { - for (size_t i = 0; i < member_->places_.size(); ++i) { - std::unique_ptr graph = build_strategy.Apply( - main_program, {member_->places_[i]}, loss_var_name, - {member_->local_scopes_[i]}, member_->nranks_, member_->use_cuda_, - member_->nccl_ctxs_.get()); - graphs.push_back(std::move(graph)); - } - } else { - std::unique_ptr graph = build_strategy.Apply( - main_program, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_cuda_, member_->nccl_ctxs_.get()); - graphs.push_back(std::move(graph)); - } + + temp_owned_graph = build_strategy.Apply( + std::move(temp_owned_graph), member_->places_, loss_var_name, + member_->local_scopes_, member_->nranks_, member_->use_cuda_, + member_->nccl_ctxs_.get()); #else - std::unique_ptr graph = build_strategy.Apply( - main_program, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_cuda_); - graphs.push_back(std::move(graph)); + temp_owned_graph = build_strategy.Apply( + std::move(temp_owned_graph), member_->places_, loss_var_name, + member_->local_scopes_, member_->nranks_, member_->use_cuda_); + #endif auto max_memory_size = GetEagerDeletionThreshold(); VLOG(10) << "Eager Deletion Threshold " << static_cast(max_memory_size) / (1 << 30); if (max_memory_size >= 0) { - for (size_t i = 0; i < graphs.size(); ++i) { - graphs[i] = member_->PrepareGCAndRefCnts( - std::move(graphs[i]), static_cast(max_memory_size)); - } + graph = member_ + ->PrepareGCAndRefCnts(std::move(temp_owned_graph), + static_cast(max_memory_size)) + .release(); + } else { + graph = temp_owned_graph.release(); } // Step 3. Create vars in each scope. Passes may also create new vars. // skip control vars and empty vars std::vector var_infos; - for (auto &graph : graphs) { - for (auto &node : graph->Nodes()) { - if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { - var_infos.emplace_back(); - var_infos.back().name_ = node->Var()->Name(); - var_infos.back().type_ = node->Var()->GetType(); - var_infos.back().persistable_ = node->Var()->Persistable(); - } + for (auto &node : graph->Nodes()) { + if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { + var_infos.emplace_back(); + var_infos.back().name_ = node->Var()->Name(); + var_infos.back().type_ = node->Var()->GetType(); + var_infos.back().persistable_ = node->Var()->Persistable(); } } // If the loss_var_name is given, the number of graph should be only one. if (loss_var_name.size()) { - size_t graph_num = ir::GraphNum(*graphs[0]); + size_t graph_num = ir::GraphNum(*graph); if (graph_num > 1) { LOG(WARNING) << "The number of graph should be only one, " "but the current graph has " - << ir::GraphNum(*graphs[0]) + << ir::GraphNum(*graph) << " sub_graphs. If you want to see the nodes of the " "sub_graphs, you should use 'FLAGS_print_sub_graph_dir' " "to specify the output dir. NOTES: if you not do training, " @@ -319,18 +328,22 @@ ParallelExecutor::ParallelExecutor( } if (build_strategy.enable_parallel_graph_) { +#ifdef PADDLE_WITH_CUDA + // TODO(Yancey1989): Remove passing in the main_program when + // allreduce_seq_pass doesn't need it as the attr. member_->executor_.reset(new details::ParallelSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->places_, - std::move(graphs))); + exec_strategy, member_->local_scopes_, member_->places_, graph)); +#else + PADDLE_THROW( + "Paddle should be compiled with CUDA for ParallelGraph Execution."); +#endif } else { if (exec_strategy.type_ == ExecutionStrategy::kDefault) { member_->executor_.reset(new details::ThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->places_, - std::move(graphs[0]))); + exec_strategy, member_->local_scopes_, member_->places_, graph)); } else { member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->places_, - std::move(graphs[0]))); + exec_strategy, member_->local_scopes_, member_->places_, graph)); } } @@ -340,7 +353,7 @@ ParallelExecutor::ParallelExecutor( } void ParallelExecutor::BCastParamsToDevices( - const std::unordered_set &vars) const { + const std::vector &vars, int trainer_id) const { // the initializing bcast, all vars would be bcast from device(0). for (auto &var : vars) { framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var); @@ -364,7 +377,7 @@ void ParallelExecutor::BCastParamsToDevices( auto place = member_->places_[i]; void *buffer; - if (i == 0) { + if (i == 0 && trainer_id == 0) { buffer = const_cast(main_tensor.data()); } else { auto local_scope = member_->local_scopes_[i]; @@ -460,43 +473,44 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( } } +ParallelExecutor::~ParallelExecutor() { + for (auto &p : member_->places_) { + platform::DeviceContextPool::Instance().Get(p)->Wait(); + } + delete member_; +} + bool ParallelExecutor::EnableParallelGraphExecution( - const ProgramDesc &main_program, const ExecutionStrategy &exec_strategy, + const ir::Graph &graph, const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy) const { if (!FLAGS_enable_parallel_graph) return false; bool enable_parallel_graph = true; - // TODO(Yancey1989): support sparse update in ParallelGraph mode. - for (auto &var_desc : main_program.Block(0).AllVars()) { - if (var_desc->GetType() == proto::VarType::SELECTED_ROWS) { - enable_parallel_graph = false; - } - } - // TODO(Yancey1989): support pserver mode - for (auto &op_desc : main_program.Block(0).AllOps()) { - if (op_desc->Type() == "send" || op_desc->Type() == "recv") { - enable_parallel_graph = false; - break; + for (ir::Node *node : graph.Nodes()) { + if (node->IsVar() && node->Var()) { + // TODO(Yancey1989): support sparse update in ParallelGraph mode. + if (node->Var()->GetType() == proto::VarType::SELECTED_ROWS) { + enable_parallel_graph = false; + break; + } + } else if (node->IsOp() && node->Op()) { + // TODO(Yancey1989): support pserver mode + if (node->Op()->Type() == "send" || node->Op()->Type() == "recv") { + enable_parallel_graph = false; + break; + } } } if (!member_->use_all_reduce_ || !member_->use_cuda_) - enable_parallel_graph = false; - if (build_strategy.enable_sequential_execution_ || - exec_strategy.type_ == ExecutionStrategy::ExecutorType::kExperimental) - enable_parallel_graph = false; + if (build_strategy.enable_sequential_execution_ || + exec_strategy.type_ == ExecutionStrategy::ExecutorType::kExperimental) + enable_parallel_graph = false; return enable_parallel_graph; } -ParallelExecutor::~ParallelExecutor() { - for (auto &p : member_->places_) { - platform::DeviceContextPool::Instance().Get(p)->Wait(); - } - delete member_; -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 121bbd55ad575477424a2fb12baab82585eae517..d4658b9623fe8c23b6a8b2903e3c48d794ba1652 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -14,9 +14,11 @@ limitations under the License. */ #pragma once +#include #include #include #include +#include #include #include "paddle/fluid/framework/details/build_strategy.h" @@ -45,12 +47,12 @@ class ParallelExecutor { public: explicit ParallelExecutor(const std::vector &places, - const std::unordered_set &bcast_vars, - const ProgramDesc &main_program, + const std::vector &bcast_vars, const std::string &loss_var_name, Scope *scope, const std::vector &local_scopes, const ExecutionStrategy &exec_strategy, - const BuildStrategy &build_strategy); + const BuildStrategy &build_strategy, + ir::Graph *graph); ~ParallelExecutor(); @@ -70,8 +72,11 @@ class ParallelExecutor { const std::string &fetched_var_name); private: - void BCastParamsToDevices(const std::unordered_set &vars) const; - bool EnableParallelGraphExecution(const ProgramDesc &main_program, + // broadcast the parameters from the 0th device. + // trainer_id the trainer index in nccl distributed training. + void BCastParamsToDevices(const std::vector &vars, + int trainer_id = 0) const; + bool EnableParallelGraphExecution(const ir::Graph &graph, const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy) const; diff --git a/paddle/fluid/framework/python_headers.h b/paddle/fluid/framework/python_headers.h index 422af19a13683dc9ae6377cac1b1ab2c2ac8f96b..8f9e3fad57f7bb87e78e334e741be23751417a78 100644 --- a/paddle/fluid/framework/python_headers.h +++ b/paddle/fluid/framework/python_headers.h @@ -24,3 +24,11 @@ limitations under the License. */ #pragma pop_macro("_XOPEN_SOURCE") #pragma pop_macro("_POSIX_C_SOURCE") + +#if !defined(PYBIND11_HIDDEN) +#ifdef _WIN32 +#define PYBIND11_HIDDEN __declspec(dllexport) +#else +#define PYBIND11_HIDDEN __attribute__((visibility("hidden"))) +#endif +#endif diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 40606d9b06baf4dbebf87f3c02580e49ae6e2a70..88f5b757a8111f6a7e269ff71054dab425c0de01 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -27,6 +27,10 @@ limitations under the License. */ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_utils.h" +#endif + namespace paddle { namespace framework { @@ -37,10 +41,34 @@ class Tensor { #ifdef PADDLE_WITH_MKLDNN public: - inline mkldnn::memory::format format() const { return format_; } + // TODO(jczaja): This is depracted and will be removed + inline mkldnn::memory::format format() const { + if (layout_ == DataLayout::kMKLDNN) { + return static_cast(mem_pd_.desc().data.format); + } else { + return mkldnn::memory::format::format_undef; + } + } - inline void set_format(const mkldnn::memory::format format) { - format_ = format; + // TODO(jczaja): This is depracted and will be removed + inline void set_format( + const mkldnn::memory::format fmt, + mkldnn::memory::data_type data_type = mkldnn::memory::f32) { + mem_pd_ = paddle::platform::create_prim_desc_from_format( + paddle::framework::vectorize2int(dims()), fmt, data_type); + layout_ = DataLayout::kMKLDNN; + } + + inline mkldnn::memory::primitive_desc get_mkldnn_prim_desc() const { + return mem_pd_; + } + + inline void set_mkldnn_prim_desc( + const mkldnn::memory::primitive_desc& mem_pd) { + // Internally MKL-DNN is just copying (increasing reference counter) + // to shared_ptr. So asignment should be quite cheap + mem_pd_ = mem_pd; + layout_ = DataLayout::kMKLDNN; } protected: @@ -48,12 +76,9 @@ class Tensor { * @brief the detail format of memory block which have layout as kMKLDNN * * @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C, - * nChw16c, etc. For a MKLDNN memory block, layout will be set as - * DataLayout::kMKLDNN meanwhile detail memory format will be kept in - * this field. + * nChw16c, etc. For a MKLDNN memory block, we store memory descriptor */ - - mkldnn::memory::format format_ = mkldnn::memory::format::format_undef; + mutable mkldnn::memory::primitive_desc mem_pd_; #endif public: diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 85d15c5d3faa5a3d021b12396f9f8ea7735f9148..a7f09df4917532e7261cee471c711897c8eb3447 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -14,8 +14,11 @@ #include "paddle/fluid/framework/tensor_util.h" #include #include +#include +#include #include #include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace framework { @@ -135,16 +138,19 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, #ifdef PADDLE_WITH_CUDA else if (platform::is_gpu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { + platform::RecordEvent record_event("TensorCopy:GPU->CPU"); auto src_gpu_place = boost::get(src_place); auto dst_cpu_place = boost::get(dst_place); memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr); } else if (platform::is_cpu_place(src_place) && platform::is_gpu_place(dst_place)) { + platform::RecordEvent record_event("TensorCopy:CPU->GPU"); auto src_cpu_place = boost::get(src_place); auto dst_gpu_place = boost::get(dst_place); memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr); } else if (platform::is_gpu_place(src_place) && platform::is_gpu_place(dst_place)) { + platform::RecordEvent record_event("TensorCopy:GPU->GPU"); if (src_ptr == dst_ptr && platform::is_same_place(src_place, dst_place)) { VLOG(3) << "Skip copy the same data from " << src_place << " to " << dst_place; @@ -155,6 +161,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr); } else if (platform::is_cuda_pinned_place(src_place) && platform::is_gpu_place(dst_place)) { + platform::RecordEvent record_event("TensorCopy:CUDAPinned->GPU"); auto src_pinned_place = boost::get(src_place); auto dst_gpu_place = boost::get(dst_place); memory::Copy(dst_gpu_place, dst_ptr, src_pinned_place, src_ptr, size, diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index 733542e4972b16a71f9e76c3076b424b7a901066..fa77b96a7bdfa28ed982db022e8e5ecaef0b443c 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -50,8 +50,6 @@ class Scope; } // namespace framework namespace operators { -template -class AlgorithmsCache; class CudnnRNNCache; @@ -144,9 +142,6 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl< #ifndef _WIN32 ncclUniqueId, platform::Communicator, #endif - operators::AlgorithmsCache, - operators::AlgorithmsCache, - operators::AlgorithmsCache, operators::CudnnRNNCache, #endif int, float>; diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 8f20f0c06e043ddc629e47c6e49280c5467b0e20..5530823b90f6580692456253b0eb9d0af4e3240b 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -18,6 +18,7 @@ #include #include #include +#include #include #include "paddle/fluid/framework/lod_tensor.h" @@ -139,6 +140,8 @@ class Autograd { } } } + + ready_op->InvokeBackwardHooks(); } } @@ -156,8 +159,9 @@ class Autograd { for (auto it : candidate->pre_ops_) { for (OpBase* pre_op : it.second) { if (!pre_op) continue; - VLOG(5) << "op dep " << candidate->op_desc_->Type() << " <---- " - << it.first << " <---- " << pre_op->op_desc_->Type(); + VLOG(5) << "op dep " << candidate->Type() << " trace id " + << candidate->trace_id_ << " <---- " << it.first << " <---- " + << pre_op->Type() << " trace id " << pre_op->trace_id_; if (visited.find(pre_op) == visited.end()) { visited.insert(pre_op); queue.push_back(pre_op); @@ -175,10 +179,12 @@ std::unique_ptr VarBase::NewVarBase(const platform::Place& dst_place, PADDLE_ENFORCE(var_->IsInitialized(), "Variable must be initialized when getting numpy tensor"); - std::unique_ptr new_var(new VarBase()); + // TODO(minqiyang): change this after move unique_name generator to CXX + const framework::LoDTensor& self_tensor = var_->Get(); + std::unique_ptr new_var(new VarBase( + "Itmp", self_tensor.type(), self_tensor.dims(), dst_place, true, false)); framework::LoDTensor* tensor = new_var->var_->GetMutable(); - tensor->Resize(var_->Get().dims()); tensor->set_lod(var_->Get().lod()); if (blocking) { @@ -194,51 +200,62 @@ std::unique_ptr VarBase::NewVarBase(const platform::Place& dst_place, } if (platform::is_gpu_place(dst_place)) { - VLOG(3) << "copy tensor " << var_desc_->Name() << " from gpu"; + VLOG(3) << "copy tensor " << Name() << " from gpu"; } return new_var; } framework::LoDTensor& VarBase::GradValue() { - VLOG(3) << "get var grad " << var_desc_->Name(); + VLOG(3) << "get var grad " << Name(); + PADDLE_ENFORCE_NOT_NULL(grads_, + "Could not get grad value from no grad variable"); return *(grads_->var_->GetMutable()); } std::map> OpBase::ApplyGrad() { if (grad_op_descs_.empty() && backward_id_ <= 0) { - VLOG(3) << "op with no grad: " << op_desc_->Type(); + VLOG(3) << "op with no grad: " << Type(); return {}; } - std::vector grad_outputs; + VLOG(3) << "apply op grad: " << Type(); + std::vector tmp_grad_outputs; if (backward_id_ > 0) { VLOG(3) << "py_layer_grad"; - grad_outputs.resize(1); - grad_outputs[0][framework::GradVarName(PyLayer::kFwdOut)] = + tmp_grad_outputs.resize(1); + tmp_grad_outputs[0][framework::GradVarName(PyLayer::kFwdOut)] = PyLayer::ApplyGrad( backward_id_, grad_input_vars_[0][framework::GradVarName(PyLayer::kFwdInp)]); } else { - grad_outputs.resize(grad_op_descs_.size()); - for (size_t k = 0; k < grad_op_descs_.size(); ++k) { + const size_t grad_op_count = grad_op_descs_.size(); + + tmp_grad_outputs.resize(grad_op_count); + for (size_t k = 0; k < grad_op_count; ++k) { framework::OpDesc* grad_op_desc = grad_op_descs_[k]; - VLOG(3) << "op grad " << grad_op_desc->Type(); - for (auto it : grad_output_vars_[k]) { - auto& outputs = grad_outputs[k][it.first]; + auto& grad_output_variable_map = grad_output_vars_[k]; + + VLOG(3) << "apply grad op " << grad_op_desc->Type(); + + // Allocate tmp grad output variable + for (auto it : grad_output_variable_map) { + auto& outputs = tmp_grad_outputs[k][it.first]; + outputs.reserve(it.second.size()); for (size_t i = 0; i < it.second.size(); ++i) { // Allocate a new variable Variable* tmp_var = new framework::Variable(); tmp_var->GetMutable(); - outputs.push_back(tmp_var); + outputs.emplace_back(tmp_var); } } - framework::RuntimeContext ctx(grad_input_vars_[k], grad_outputs[k]); + // Run grad op + framework::RuntimeContext ctx(grad_input_vars_[k], tmp_grad_outputs[k]); // No need to do compile time infer shape here. // grad_op_desc_->InferShape(*block_); - grad_op_desc->InferVarType(block_); + // grad_op_desc->InferVarType(block_); std::unique_ptr opbase = framework::OpRegistry::CreateOp(*grad_op_desc); @@ -249,13 +266,15 @@ std::map> OpBase::ApplyGrad() { framework::Scope scope; PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place_); p.op.RuntimeInferShape(scope, place_, ctx); - p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx)); + p.func( + framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx, nullptr)); } } + // Add tmp grad outputs to original grad vars for (size_t k = 0; k < grad_output_vars_.size(); ++k) { for (auto it : grad_output_vars_[k]) { - auto& outputs = grad_outputs[k][it.first]; + auto& outputs = tmp_grad_outputs[k][it.first]; auto& origin_outputs = it.second; PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size()); @@ -271,6 +290,22 @@ std::map> OpBase::ApplyGrad() { return input_vars_; } +void OpBase::InvokeBackwardHooks() { + VLOG(3) << "call backward hooks, hooks num: " << backward_hooks_.size(); + + // call backward hooks + for (py::object& callable : backward_hooks_) { + callable(this); + } +} + +void OpBase::RegisterBackwardHooks(const py::object& callable) { + VLOG(3) << "Register backward hooks " << trace_id_; + + // TODO(minqiyang): check the callable format + backward_hooks_.push_back(callable); +} + void VarBase::RunBackward() { if (!pre_op_) return; @@ -293,19 +328,14 @@ void PyLayer::RegisterFunc(int func_id, const py::object& py_func) { int PyLayer::NumFuncs() { return py_funcs_.size(); } -std::vector PyLayer::Apply(int func_id, - const std::vector& inputs) { +std::vector PyLayer::Apply(int func_id, + const std::vector& inputs) { std::vector invars; for (const VarBase* in : inputs) { invars.push_back(in->var_); } PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end()); - std::vector outvars = CallPythonFunc(py_funcs_[func_id], invars); - std::vector ret; - for (Variable* v : outvars) { - ret.push_back(new VarBase(v, new VarBase(true))); - } - return ret; + return CallPythonFunc(py_funcs_[func_id], invars); } std::vector PyLayer::ApplyGrad( diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 78205486c5534ac0c61cc6d545bdafa4dfc95695..618a5b7a03295ce679dc6a88e0eac57069e78b8b 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -44,8 +44,13 @@ class PreparedOp { PreparedOp(const framework::OperatorBase& op, const framework::RuntimeContext& ctx, framework::OperatorWithKernel::OpKernelFunc func, - platform::DeviceContext* dev_ctx) - : op(op), ctx(ctx), func(func), dev_ctx(dev_ctx) {} + platform::DeviceContext* dev_ctx, + std::vector* kernel_configs) + : op(op), + ctx(ctx), + func(func), + dev_ctx(dev_ctx), + kernel_configs(kernel_configs) {} static PreparedOp Prepare(const framework::RuntimeContext& ctx, const framework::OperatorWithKernel& op, @@ -64,8 +69,9 @@ class PreparedOp { framework::OperatorWithKernel::OpKernelMap& kernels = kernels_iter->second; - auto expected_kernel_key = op.GetExpectedKernelType( - framework::ExecutionContext(op, framework::Scope(), *dev_ctx, ctx)); + auto expected_kernel_key = + op.GetExpectedKernelType(framework::ExecutionContext( + op, framework::Scope(), *dev_ctx, ctx, nullptr)); VLOG(3) << "expected_kernel_key:" << expected_kernel_key; auto kernel_iter = kernels.find(expected_kernel_key); @@ -83,7 +89,9 @@ class PreparedOp { PADDLE_THROW("op %s does not have kernel for %s", op.Type(), KernelTypeToString(expected_kernel_key)); } - return PreparedOp(op, ctx, kernel_iter->second, dev_ctx); + std::vector* kernel_configs = + op.GetKernelConfig(expected_kernel_key); + return PreparedOp(op, ctx, kernel_iter->second, dev_ctx, kernel_configs); } inline platform::DeviceContext* GetDeviceContext() const { return dev_ctx; } @@ -92,6 +100,7 @@ class PreparedOp { const framework::RuntimeContext& ctx; framework::OperatorWithKernel::OpKernelFunc func; platform::DeviceContext* dev_ctx; + std::vector* kernel_configs; }; class OpBase; @@ -103,43 +112,101 @@ class OpBase; */ class VarBase { public: - VarBase() : VarBase(new framework::Variable(), new VarBase(true)) {} + // Internal interface, create VarBase from exist variable + VarBase(const std::string& name, framework::Variable* var, VarBase* grad, + bool stop_gradient) + : VarBase(name, var->Get().type(), + var->Get().dims(), + var->Get().place(), var, grad, + stop_gradient, false) {} + + // Python interface + VarBase(const std::string& name, const framework::proto::VarType::Type dtype, + const std::vector& shape, const platform::Place& place, + bool stop_gradient, bool persistable) + : VarBase(name, dtype, framework::make_ddim(shape), place, stop_gradient, + persistable) {} + + // Internal interface, create VarBase from with ddim + VarBase(const std::string& name, const framework::proto::VarType::Type dtype, + const framework::DDim& shape, const platform::Place& place, + bool stop_gradient, bool persistable) + : VarBase(name, dtype, shape, place, nullptr, nullptr, stop_gradient, + persistable) {} - // Owns `var` and `grad` - VarBase(framework::Variable* var, VarBase* grad) - : var_desc_(nullptr), + private: + VarBase(const std::string& name, framework::proto::VarType::Type dtype, + const framework::DDim& shape, const platform::Place& place, + framework::Variable* var, VarBase* grad, bool stop_gradient, + bool persistable) + : name_(name), + dtype_(dtype), + place_(place), var_(var), grads_(grad), - stop_gradient_(false), - pre_op_(nullptr), - pre_op_out_idx_(-1) {} - - explicit VarBase(bool stop_gradient) - : var_desc_(nullptr), - var_(new framework::Variable()), - grads_(stop_gradient ? nullptr : new VarBase(true)), stop_gradient_(stop_gradient), + persistable_(persistable), pre_op_(nullptr), - pre_op_out_idx_(-1) {} + pre_op_out_name_(), + pre_op_out_idx_(-1) { + if (!var_) { + var_ = new framework::Variable(); + auto tensor = var_->GetMutable(); + tensor->Resize(shape); + tensor->mutable_data(place_, dtype_); + } + } + public: virtual ~VarBase() { if (var_) { delete var_; + var_ = nullptr; } if (grads_) { delete grads_; + grads_ = nullptr; } + + pre_op_ = nullptr; + pre_op_out_idx_ = -1; + } + + inline void SetName(const std::string& name) { name_ = name; } + inline std::string Name() const { return name_; } + + inline std::vector Shape() const { + if (var_->IsInitialized()) { + return framework::vectorize(var_->Get().dims()); + } else { + return {}; + } + } + + inline framework::proto::VarType::Type DType() const { return dtype_; } + + inline void SetStopGradient(bool stop_gradient) { + stop_gradient_ = stop_gradient; } + inline bool IsStopGradient() const { return stop_gradient_; } - OpBase* PreOp() const { return pre_op_; } - int PreOpOutIdx() const { return pre_op_out_idx_; } + inline void SetPersistable(bool persistable) { persistable_ = persistable; } + inline bool IsPersistable() const { return persistable_; } - void SetStopGradient(bool stop_gradient) { stop_gradient_ = stop_gradient; } - bool IsStopGradient() const { return stop_gradient_; } + inline OpBase* PreOp() const { return pre_op_; } + inline int PreOpOutIdx() const { return pre_op_out_idx_; } void RunBackward(); + inline void ResetPreOp(OpBase* op) { + if (op == pre_op_) { + // clear pre_op info when op equals to var's pre_op + pre_op_ = nullptr; + pre_op_out_idx_ = -1; + } + } + void TrackPreOp(OpBase* pre_op, const std::string& pre_op_out_name, int pre_op_out_idx, bool pre_op_stop_gradient) { pre_op_ = pre_op; @@ -151,7 +218,7 @@ class VarBase { } void ClearGradient() { - VLOG(1) << "clear gradient of " << var_desc_->Name(); + VLOG(1) << "clear gradient of " << Name(); if (grads_ && grads_->var_ && grads_->var_->IsInitialized()) { auto grads_t = grads_->var_->GetMutable(); operators::math::set_constant( @@ -167,19 +234,20 @@ class VarBase { const bool blocking) const; inline std::string GradName() const { - PADDLE_ENFORCE( - var_desc_, - "Couldn't get gradient variable's name, please call backward() first"); - return string::Sprintf("%s@IGrad", var_desc_->Name()); + return string::Sprintf("%s@IGrad", Name()); } - framework::VarDesc* var_desc_; + std::string name_; + framework::proto::VarType::Type dtype_; + platform::Place place_; framework::Variable* var_; VarBase* grads_; private: bool stop_gradient_; + bool persistable_; + OpBase* pre_op_; std::string pre_op_out_name_; int pre_op_out_idx_; @@ -188,15 +256,27 @@ class VarBase { /* The wrapper for OpDesc which holds a OpDesc and a OpDesc of its * gradient. This object should be managed totally by Python intepreter. */ -class OpBase { +class PYBIND11_HIDDEN OpBase { public: - OpBase() - : op_desc_(nullptr), + OpBase(const std::string& type) + : type_(type), + trace_id_(-1), forward_id_(-1), backward_id_(-1), - place_(platform::CPUPlace()) {} + place_(platform::CPUPlace()), + backward_hooks_() {} virtual ~OpBase() { + // TODO(minqiyang): remove op_desc from block_desc in tracer + // + // reset all output vars' pre op + for (auto iter : output_vars_) { + for (VarBase* var : iter.second) { + var->ResetPreOp(this); + } + } + + // release resource for (framework::OpDesc* desc : grad_op_descs_) { delete desc; } @@ -204,9 +284,34 @@ class OpBase { std::map> ApplyGrad(); - // One of `op_desc_` or `forward_id_` is set, not both. - // For pure python PyLayer, use `forward_id_`, otherwise, use op_desc_. - framework::OpDesc* op_desc_; + inline std::string Type() const { return type_; } + inline std::string GradOpType(size_t index) const { + PADDLE_ENFORCE_NOT_NULL(grad_op_descs_[index]); + return grad_op_descs_[index]->Type(); + } + + void RegisterBackwardHooks(const py::object& callable); + + void InvokeBackwardHooks(); + + void TrackPreOp(const VarBase* inp_var, const std::string& inp_name) { + if (inp_var->PreOp() && !inp_var->IsStopGradient()) { + VLOG(3) << "add pre op " << inp_var->PreOp()->Type() << " in slot " + << inp_name; + pre_ops_[inp_name].push_back(inp_var->PreOp()); + pre_ops_out_idx_[inp_name].push_back(inp_var->PreOpOutIdx()); + } else { + VLOG(3) << "no pre op in slot " << inp_name + << " input var stop_gradient: " << inp_var->IsStopGradient(); + pre_ops_[inp_name].push_back(nullptr); + // pre_ops_out_idx_[inp_name].push_back(-1); + } + } + + std::string type_; + // One of `trace_id_` or `forward_id_` is set, not both. + // For pure python PyLayer, use `forward_id_`, otherwise, use trace_id_. + int trace_id_; int forward_id_; // When has backward, one of `grad_op_descs_` or `backward_id_` is set, @@ -227,7 +332,7 @@ class OpBase { // Outputs to a vector of bwd ops. std::vector grad_output_vars_; - framework::BlockDesc* block_; + std::vector backward_hooks_; }; class Layer { @@ -251,8 +356,8 @@ class PyLayer { static int NumFuncs(); - static std::vector Apply(int func_id, - const std::vector& inputs); + static std::vector Apply( + int func_id, const std::vector& inputs); static std::vector ApplyGrad( int func_id, const std::vector& inputs); diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index bc39d11ba00a6a7c386162a1f9201c6f992c8692..7ee92b4d8c46d8814400dbc02847d701005f3d5b 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -14,13 +14,32 @@ #include "paddle/fluid/imperative/tracer.h" +#include +#include +#include +#include + #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" +#ifdef WITH_GPERFTOOLS +#include "gperftools/profiler.h" +#endif + +DEFINE_string( + tracer_profile_fname, "", + "Profiler filename for imperative tracer, which generated by gperftools." + "Only valid when compiled `WITH_PROFILER=ON`. Empty if disable."); + namespace paddle { namespace imperative { +static std::once_flag gTracerProfileOnce; +#ifdef WITH_GPERFTOOLS +static bool gTracerProfilerStarted = false; +#endif + void CreateGradOp(const framework::OpDesc& op_desc, const std::unordered_set& no_grad_set, const std::vector& grad_sub_block, @@ -37,15 +56,19 @@ void CreateGradOp(const framework::OpDesc& op_desc, } } -void InitVar(framework::Variable* var, framework::Variable* grad_var, - platform::DeviceContext* dev_ctx) { +void InitGrad(VarBase* var, platform::DeviceContext* dev_ctx) { + PADDLE_ENFORCE_NOT_NULL(var, "Could not get valid var base"); PADDLE_ENFORCE_NOT_NULL(dev_ctx, "Could not get valid device from forward op"); - auto& var_t = var->Get(); - grad_var->GetMutable()->mutable_data( - var_t.dims(), dev_ctx->GetPlace()); - operators::math::set_constant( - *dev_ctx, grad_var->GetMutable(), 0.0); + + if (var->grads_ == nullptr) { + auto& var_t = var->var_->Get(); + var->grads_ = new VarBase(var->GradName(), framework::proto::VarType::FP32, + framework::vectorize(var_t.dims()), + dev_ctx->GetPlace(), true, false); + auto grad_t = var->grads_->var_->GetMutable(); + operators::math::set_constant(*dev_ctx, grad_t, 0.0); + } } platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs) { @@ -66,40 +89,108 @@ platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs) { return result; } -void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, - const VarBasePtrMap& outputs, framework::BlockDesc* block, - const platform::Place expected_place, - const bool stop_gradient) { - std::map vars; +framework::VariableNameMap CreateInputVarNameMap( + const OpBase* op, const VarBasePtrMap& varbase_map) { + framework::VariableNameMap result; - framework::OpDesc* op_desc = op->op_desc_; - VLOG(3) << "tracer tracing " << op_desc->Type(); - op_desc->InferShape(*block); - op_desc->InferVarType(block); - std::unique_ptr op_base = - framework::OpRegistry::CreateOp(*op_desc); + auto& info_map = framework::OpInfoMap::Instance(); + auto* op_info = info_map.GetNullable(op->Type()); + if (op_info == nullptr || op_info->proto_ == nullptr) { + return result; + } + + for (auto& in : op_info->Proto().inputs()) { + auto it = varbase_map.find(in.name()); + if (it == varbase_map.end()) { + PADDLE_ENFORCE(in.dispensable()); + result[in.name()] = {}; + } else { + auto var_vector = it->second; + std::vector args; + args.reserve(var_vector.size()); + for (VarBase* var_base : var_vector) { + args.emplace_back(var_base->Name()); + } + result[in.name()] = args; + } + } + return result; +} + +framework::VariableNameMap CreateOutputVarNameMap( + const OpBase* op, const VarBasePtrMap& varbase_map) { + framework::VariableNameMap result; + + auto& info_map = framework::OpInfoMap::Instance(); + auto* op_info = info_map.GetNullable(op->Type()); + if (op_info == nullptr || op_info->proto_ == nullptr) { + return result; + } + + for (auto& out : op_info->Proto().outputs()) { + auto it = varbase_map.find(out.name()); + if (it == varbase_map.end()) { + PADDLE_ENFORCE(out.dispensable()); + result[out.name()] = {}; + } else { + auto var_vector = it->second; + std::vector args; + args.reserve(var_vector.size()); + for (VarBase* var_base : var_vector) { + args.emplace_back(var_base->Name()); + } + result[out.name()] = args; + } + } + return result; +} + +Tracer::Tracer(framework::BlockDesc* root_block) : root_block_(root_block) { + if (!FLAGS_tracer_profile_fname.empty()) { + std::call_once(gTracerProfileOnce, [] { +#ifdef WITH_GPERFTOOLS + ProfilerStart(FLAGS_tracer_profile_fname.c_str()); + gTracerProfilerStarted = true; +#else + LOG(WARNING) << "Paddle is not compiled with gperftools. " + "FLAGS_tracer_profile_fname will be ignored"; +#endif + }); + } +} + +std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, + const VarBasePtrMap& outputs, + framework::AttributeMap attrs_map, + const platform::Place expected_place, + const bool stop_gradient) { +#ifdef WITH_GPERFTOOLS + if (gTracerProfilerStarted) { + ProfilerFlush(); + } +#endif framework::VariableValueMap invars_map; framework::VariableValueMap outvars_map; + // Construct input_vars_map and output_vars_map + std::map current_vars_map; op->input_vars_ = inputs; for (auto it : op->input_vars_) { auto& invars = invars_map[it.first]; invars.reserve(it.second.size()); for (VarBase* inp : it.second) { - PADDLE_ENFORCE_NOT_NULL(inp->var_, "op %s input %s nullptr", - op->op_desc_->Type(), inp->var_desc_->Name()); + PADDLE_ENFORCE_NOT_NULL(inp->var_, "op %s input %s nullptr", op->Type(), + inp->Name()); invars.emplace_back(inp->var_); - vars[inp->var_desc_->Name()] = inp; - if (inp->PreOp()) { - op->pre_ops_[it.first].push_back(inp->PreOp()); - op->pre_ops_out_idx_[it.first].push_back(inp->PreOpOutIdx()); - } else { - op->pre_ops_[it.first].push_back(nullptr); + op->TrackPreOp(inp, it.first); + if (!stop_gradient) { + current_vars_map[inp->Name()] = inp; } - VLOG(3) << "input vname " << inp->var_desc_->Name() << " " - << inp->var_->IsInitialized(); + VLOG(3) << "input var name: " << inp->Name() + << " inited: " << inp->var_->IsInitialized() + << " stop_grad: " << inp->IsStopGradient(); } } @@ -108,25 +199,38 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, auto& outvars = outvars_map[it.first]; const std::vector& outputs = it.second; outvars.reserve(outputs.size()); - for (size_t i = 0; i < outputs.size(); ++i) { + for (size_t i = 0U; i < outputs.size(); ++i) { VarBase* out = outputs[i]; outvars.emplace_back(out->var_); - vars[out->var_desc_->Name()] = out; - - framework::VarDesc* var_desc = block->FindVar(out->var_desc_->Name()); - if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { - out->var_->GetMutable(); - } else { - LOG(ERROR) << "tracer doesn't support yet"; - } out->TrackPreOp(op, it.first, i, stop_gradient); + if (!stop_gradient) { + current_vars_map[out->Name()] = out; + } - VLOG(3) << "output vname " << out->var_desc_->Name() << " " - << out->var_->IsInitialized(); + VLOG(3) << "input var name: " << out->Name() + << " inited: " << out->var_->IsInitialized() + << " stop_grad: " << out->IsStopGradient(); } } - VLOG(3) << "tracer running " << op_desc->Type(); + // Check attrs and create op + framework::VariableNameMap invars_name_map = + CreateInputVarNameMap(op, inputs); + framework::VariableNameMap outvars_name_map = + CreateOutputVarNameMap(op, outputs); + + auto& info = framework::OpInfoMap::Instance().Get(op->Type()); + if (info.Checker() != nullptr) { + info.Checker()->Check(&attrs_map); + } + + std::unique_ptr op_base = + framework::OpRegistry::CreateOp(op->Type(), invars_name_map, + outvars_name_map, attrs_map); + + // TODO(minqiyang): Support infer var type in imperative mode + // Run forward op + VLOG(3) << "tracer running " << op->Type(); framework::RuntimeContext ctx(invars_map, outvars_map); // TODO(panyx0718): Cache p. @@ -138,84 +242,99 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, op->place_ = GetExpectedPlace(expected_place, inputs); PreparedOp prepared_op = PreparedOp::Prepare(ctx, *op_kernel, op->place_); prepared_op.op.RuntimeInferShape(scope, op->place_, ctx); - prepared_op.func(framework::ExecutionContext( - prepared_op.op, scope, *prepared_op.dev_ctx, prepared_op.ctx)); + prepared_op.func( + framework::ExecutionContext(prepared_op.op, scope, *prepared_op.dev_ctx, + prepared_op.ctx, prepared_op.kernel_configs)); + // construct backward op + std::set vars_saved_for_backward; if (!stop_gradient) { + VLOG(5) << "start construct backward op"; + + // construct grad op descs + std::unique_ptr fwd_op_desc(new framework::OpDesc( + op->Type(), invars_name_map, outvars_name_map, attrs_map)); std::unique_ptr> grad_to_var( new std::unordered_map()); - CreateGradOp(*op_desc, {}, {block}, &op->grad_op_descs_, grad_to_var.get()); + // NOTE(minqiyang): We don't support control flow op in imperative now + // Add grad_block_ when we want to support it + CreateGradOp(*fwd_op_desc, {}, {}, &op->grad_op_descs_, grad_to_var.get()); + + VLOG(5) << "create grad op desc: " << op->grad_op_descs_[0]->Type(); - op->grad_input_vars_.resize(op->grad_op_descs_.size()); - op->grad_output_vars_.resize(op->grad_op_descs_.size()); - for (size_t i = 0; i < op->grad_op_descs_.size(); ++i) { + const size_t grad_op_count = op->grad_op_descs_.size(); + + op->grad_input_vars_.resize(grad_op_count); + op->grad_output_vars_.resize(grad_op_count); + + for (size_t i = 0; i < grad_op_count; ++i) { framework::OpDesc* grad_op_desc = op->grad_op_descs_[i]; for (auto it : grad_op_desc->Inputs()) { auto& grad_in_vars = op->grad_input_vars_[i][it.first]; + grad_in_vars.reserve(it.second.size()); for (const std::string& grad_invar : it.second) { - block->FindRecursiveOrCreateVar(grad_invar); auto var_it = grad_to_var->find(grad_invar); if (var_it == grad_to_var->end()) { - auto fwd_var_it = vars.find(grad_invar); - PADDLE_ENFORCE(fwd_var_it != vars.end()); + auto fwd_var_it = current_vars_map.find(grad_invar); + PADDLE_ENFORCE(fwd_var_it != current_vars_map.end()); // Forward inputs or outputs. - grad_in_vars.push_back(fwd_var_it->second->var_); + grad_in_vars.emplace_back(fwd_var_it->second->var_); } else { - VarBase* var = vars[var_it->second]; - if (!var->grads_->var_->IsInitialized()) { - InitVar(var->var_, var->grads_->var_, - prepared_op.GetDeviceContext()); - } + VarBase* var = current_vars_map[var_it->second]; + InitGrad(var, prepared_op.GetDeviceContext()); // Douts. - grad_in_vars.push_back(var->grads_->var_); + grad_in_vars.emplace_back(var->grads_->var_); } + + vars_saved_for_backward.insert(it.first); } } for (auto it : grad_op_desc->Outputs()) { auto& grad_out_vars = op->grad_output_vars_[i][it.first]; for (const std::string& grad_outvar : it.second) { - block->FindRecursiveOrCreateVar(grad_outvar); auto var_it = grad_to_var->find(grad_outvar); PADDLE_ENFORCE(var_it != grad_to_var->end(), "Could not found the grad op output var, should this " "operator %s's stop gradient be True", - op_desc->Type()); - VarBase* var = vars[var_it->second]; - if (!var->grads_->var_->IsInitialized()) { - InitVar(var->var_, var->grads_->var_, - prepared_op.GetDeviceContext()); - } + op->Type()); + VarBase* var = current_vars_map[var_it->second]; + InitGrad(var, prepared_op.GetDeviceContext()); grad_out_vars.push_back(var->grads_->var_); } } } } - op->block_ = block; + return vars_saved_for_backward; } std::vector Tracer::PyTrace(OpBase* op, const std::vector& inputs, bool stop_gradient) { - VLOG(3) << "py_trace"; + VLOG(3) << "py_trace " << op->Type(); + op->input_vars_[PyLayer::kFwdInp] = inputs; - op->output_vars_[PyLayer::kFwdOut] = PyLayer::Apply(op->forward_id_, inputs); + + std::vector ret_vars = + PyLayer::Apply(op->forward_id_, inputs); + for (VarBase* inp : inputs) { - if (inp->PreOp()) { - op->pre_ops_[PyLayer::kFwdInp].push_back(inp->PreOp()); - op->pre_ops_out_idx_[PyLayer::kFwdInp].push_back(inp->PreOpOutIdx()); - } else { - op->pre_ops_[PyLayer::kFwdInp].push_back(nullptr); - } + op->TrackPreOp(inp, PyLayer::kFwdInp); } - auto& outputs = op->output_vars_[PyLayer::kFwdOut]; - for (size_t i = 0; i < outputs.size(); ++i) { - VarBase* out = outputs[i]; + std::vector& outputs = op->output_vars_[PyLayer::kFwdOut]; + outputs.reserve(ret_vars.size()); + for (size_t i = 0U; i != ret_vars.size(); ++i) { + framework::Variable* v = ret_vars[i]; + VarBase* out = new VarBase(string::Sprintf("%s_out_%d", op->Type(), i), v, + nullptr, stop_gradient); + outputs.emplace_back(out); out->TrackPreOp(op, PyLayer::kFwdOut, i, stop_gradient); } + if (!stop_gradient) { + VLOG(5) << "start construct backward op"; op->grad_input_vars_.resize(1); op->grad_output_vars_.resize(1); auto& grad_input_vars = @@ -230,23 +349,16 @@ std::vector Tracer::PyTrace(OpBase* op, grad_input_vars.push_back(out->var_); } + // TODO(minqiyang): Add GPU support for PyLayer, only support CPU now platform::CPUPlace place; for (VarBase* out : outputs) { + InitGrad(out, platform::DeviceContextPool::Instance().Get(place)); grad_input_vars.push_back(out->grads_->var_); - if (!grad_input_vars.back()->IsInitialized()) { - // TODO(minqiyang): Add GPU support for PyLayer, only support CPU now - InitVar(out->var_, grad_input_vars.back(), - platform::DeviceContextPool::Instance().Get(place)); - } } - for (const VarBase* inp : inputs) { + for (VarBase* inp : inputs) { + InitGrad(inp, platform::DeviceContextPool::Instance().Get(place)); grad_output_vars.push_back(inp->grads_->var_); - if (!grad_output_vars.back()->IsInitialized()) { - // TODO(minqiyang): Add GPU support for PyLayer, only support CPU now - InitVar(inp->var_, grad_output_vars.back(), - platform::DeviceContextPool::Instance().Get(place)); - } } } return outputs; diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 690838215581b09ff35a0ea13f30655b77e6e187..7b65d55e9eff1444d84a3fba284ecbb8b47d1733 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -15,7 +15,10 @@ #pragma once #include +#include #include +#include +#include #include #include "paddle/fluid/framework/op_desc.h" @@ -33,20 +36,22 @@ void CreateGradOp(const framework::OpDesc& op_desc, framework::OpDesc** grad_op_desc, std::unordered_map* grad_to_var); -void InitVar(framework::Variable* var, framework::Variable* grad_var); +void InitVar(const VarBase* var, framework::Variable* grad_var, + platform::DeviceContext* dev_ctx); platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs); class Tracer { public: - explicit Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {} + explicit Tracer(framework::BlockDesc* root_block); virtual ~Tracer() {} - void Trace(OpBase* op, const VarBasePtrMap& inputs, - const VarBasePtrMap& outputs, framework::BlockDesc* block, - const platform::Place expected_place, - const bool stop_gradient = false); + std::set Trace(OpBase* op, const VarBasePtrMap& inputs, + const VarBasePtrMap& outputs, + framework::AttributeMap attrs_map, + const platform::Place expected_place, + const bool stop_gradient = false); std::vector PyTrace(OpBase* op, const std::vector& inputs, bool stop_gradient = false); diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 157862016e3556902f6507e02417624363ed1029..762640d6d1ce12dff511fc7149e872efa834036c 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -16,6 +16,7 @@ add_subdirectory(utils) if (TENSORRT_FOUND) add_subdirectory(tensorrt) endif() +# add_subdirectory(anakin) get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES) diff --git a/paddle/fluid/inference/anakin/CMakeLists.txt b/paddle/fluid/inference/anakin/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..b418af62f8cae4513bcca24f057d1fe100bbea25 --- /dev/null +++ b/paddle/fluid/inference/anakin/CMakeLists.txt @@ -0,0 +1,4 @@ +cc_library(anakin_engine SRCS engine.cc) +target_link_libraries(anakin_engine anakin anakin_saber_common) +cc_test(test_anakin_engine SRCS test_anakin_engine.cc DEPS anakin_engine) +add_subdirectory(convert) diff --git a/paddle/fluid/inference/anakin/convert/CMakeLists.txt b/paddle/fluid/inference/anakin/convert/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..f5bfee861f14877b5a67bc48aeb14b8213a27370 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/CMakeLists.txt @@ -0,0 +1,2 @@ +cc_library(anakin_op_converter SRCS fc.cc registrar.cc DEPS anakin_engine framework_proto scope) +cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op) diff --git a/paddle/fluid/inference/anakin/convert/fc.cc b/paddle/fluid/inference/anakin/convert/fc.cc new file mode 100644 index 0000000000000000000000000000000000000000..33a5aff1de2851ad55c2df83cc48ba86f8ded754 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/fc.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/convert/fc.h" +#include + +using anakin::graph::GraphGlobalMem; +using anakin::AK_FLOAT; +using anakin::Precision; +using anakin::saber::NV; +using anakin::saber::X86; +using anakin::saber::Shape; +using anakin::PBlock; +using anakin::PTuple; + +namespace paddle { +namespace inference { +namespace anakin { + +void FcOpConverter::operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + + auto x_name = op_desc.Input("X").front(); + auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + auto *y_v = scope.FindVar(op_desc.Input("Y").front()); + PADDLE_ENFORCE_NOT_NULL(y_v); + auto *y_t = y_v->GetMutable(); + + auto input_name = op_desc.Input("X").front(); + auto output_name = op_desc.Output("Out").front(); + + auto weight_shape = framework::vectorize2int(y_t->dims()); + engine_->AddOp(op_name, "Dense", {input_name}, {output_name}); + engine_->AddOpAttr(op_name, "bias_term", false); + engine_->AddOpAttr(op_name, "axis", 1); + int out_dim = weight_shape[1]; + engine_->AddOpAttr(op_name, "out_dim", out_dim); + + weight_shape.push_back(1); + weight_shape.push_back(1); + Shape anakin_shape(weight_shape); + + framework::LoDTensor weight_tensor; + weight_tensor.Resize(y_t->dims()); + TensorCopySync((*y_t), platform::CPUPlace(), &weight_tensor); + + auto *weight1 = + GraphGlobalMem::Global().template new_block(anakin_shape); + float *cpu_data = static_cast(weight1->h_tensor().mutable_data()); + std::copy_n(weight_tensor.data(), weight_tensor.numel(), cpu_data); + weight1->d_tensor().set_shape(anakin_shape); + weight1->d_tensor().copy_from(weight1->h_tensor()); + engine_->AddOpAttr(op_name, "weight_1", *weight1); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/fc.h b/paddle/fluid/inference/anakin/convert/fc.h new file mode 100644 index 0000000000000000000000000000000000000000..b670486f12b36043a01ceb002da8756901ed01ce --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/fc.h @@ -0,0 +1,38 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +class FcOpConverter : public AnakinOpConverter { + public: + FcOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, + bool test_mode) override; + virtual ~FcOpConverter() {} + + private: +}; + +static Registrar register_fc_op_converter("fc"); +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/op_converter.h b/paddle/fluid/inference/anakin/convert/op_converter.h new file mode 100644 index 0000000000000000000000000000000000000000..b9a221079dcec78fc86ebed7dfac0c59ec0f8540 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/op_converter.h @@ -0,0 +1,112 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "framework/core/types.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/anakin/convert/registrar.h" +#include "paddle/fluid/inference/anakin/engine.h" +#include "paddle/fluid/inference/utils/singleton.h" +#include "saber/saber_types.h" + +namespace paddle { +namespace inference { +namespace anakin { + +using AnakinNvEngine = + AnakinEngine<::anakin::saber::NV, ::anakin::Precision::FP32>; + +class AnakinOpConverter { + public: + AnakinOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, bool test_mode) {} + void ConvertOp(const framework::proto::OpDesc &op, + const std::unordered_set ¶meters, + const framework::Scope &scope, AnakinNvEngine *engine, + bool test_mode = false) { + framework::OpDesc op_desc(op, nullptr); + std::string op_type = op_desc.Type(); + std::shared_ptr it{nullptr}; + + if (op_type == "mul") { + PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL); + std::string Y = op_desc.Input("Y")[0]; + std::cout << Y << parameters.count(Y) << std::endl; + if (parameters.count(Y)) { + it = OpRegister::instance()->Get("fc"); + } + } + + if (!it) { + it = OpRegister::instance()->Get(op_type); + } + PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", op_type); + it->SetEngine(engine); + (*it)(op, scope, test_mode); + } + + void ConvertBlock(const framework::proto::BlockDesc &block, + const std::unordered_set ¶meters, + const framework::Scope &scope, AnakinNvEngine *engine) { + std::unique_lock lock(mutex_); + for (auto i = 0; i < block.ops_size(); i++) { + auto &op = block.ops(i); + ConvertOp(op, parameters, scope, engine); + } + } + void SetEngine(AnakinNvEngine *engine) { engine_ = engine; } + virtual ~AnakinOpConverter() {} + + protected: + bool test_mode_; + AnakinNvEngine *engine_{nullptr}; + + private: + std::unordered_map converters_; + framework::Scope *scope_{nullptr}; + std::mutex mutex_; +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle + +#define REGISTER_ANAKIN_OP_CONVERTER(op_type__, Converter__) \ + struct anakin_##op_type__##_converter \ + : public ::paddle::framework::Registrar { \ + anakin_##op_type__##_converter() { \ + ::paddle::inference:: \ + Registry::Register< \ + ::paddle::inference::anakin::Converter__>(#op_type__); \ + } \ + }; \ + anakin_##op_type__##_converter anakin_##op_type__##_converter__; \ + int TouchConverterRegister_anakin_##op_type__() { \ + anakin_##op_type__##_converter__.Touch(); \ + return 0; \ + } + +#define USE_ANAKIN_CONVERTER(op_type__) \ + extern int TouchConverterRegister_anakin_##op_type__(); \ + static int use_op_converter_anakin_##op_type__ __attribute__((unused)) = \ + TouchConverterRegister_anakin_##op_type__(); diff --git a/paddle/fluid/framework/details/eager_deletion_pass.h b/paddle/fluid/inference/anakin/convert/registrar.cc similarity index 61% rename from paddle/fluid/framework/details/eager_deletion_pass.h rename to paddle/fluid/inference/anakin/convert/registrar.cc index d7a7a9709d970841060778806451bc21cb2c7571..701ebdb2d43cf524330f946ac56d32dfa884f42a 100644 --- a/paddle/fluid/framework/details/eager_deletion_pass.h +++ b/paddle/fluid/inference/anakin/convert/registrar.cc @@ -12,21 +12,23 @@ // See the License for the specific language governing permissions and // limitations under the License. -#pragma once - -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/inference/anakin/convert/registrar.h" namespace paddle { -namespace framework { -namespace details { +namespace inference { +namespace anakin { + +std::shared_ptr OpRegister::Get(const std::string &name) { + auto it = registry_.find(name); + if (it == registry_.end()) return nullptr; + return it->second(); +} -class EagerDeletionPass : public ir::Pass { - protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; -}; +OpRegister *OpRegister::instance() { + static OpRegister factory; + return &factory; +} -} // namespace details -} // namespace framework +} // namespace anakin +} // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/registrar.h b/paddle/fluid/inference/anakin/convert/registrar.h new file mode 100644 index 0000000000000000000000000000000000000000..afce66ca084143ae203af9a60089aa2f5d18a725 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/registrar.h @@ -0,0 +1,58 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include + +namespace paddle { +namespace inference { +namespace anakin { + +class AnakinOpConverter; + +class OpRegister { + public: + OpRegister() = default; + std::shared_ptr Get(const std::string &name); + static OpRegister *instance(); + void OpRegisterFn(const std::string &name, + std::function()> fn) { + registry_[name] = fn; + } + + private: + using RegisterFnType = std::function()>; + std::map()>> + registry_; +}; + +template +class Registrar { + public: + Registrar(const std::string &name, Args... args) { + std::shared_ptr converter = + std::make_shared(std::move(args)...); + OpRegister::instance()->OpRegisterFn(name, + [converter]() { return converter; }); + } +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/test_fc_op.cc b/paddle/fluid/inference/anakin/convert/test_fc_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..7b8ceefe28873f0ffb9cedbb04b832ba029b7de4 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/test_fc_op.cc @@ -0,0 +1,54 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/anakin/convert/fc.h" +#include "paddle/fluid/inference/anakin/convert/op_converter.h" +#include "paddle/fluid/inference/anakin/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace anakin { + +TEST(fc_op, test) { + auto fc_converter = OpRegister::instance()->Get("fc"); + ASSERT_TRUE(fc_converter != nullptr); + // Registrar register_fc("fc"); + // auto fc = std::make_shared(); + + std::unordered_set parameters({"mul_y"}); + framework::Scope scope; + AnakinConvertValidation validator(parameters, scope); + validator.DeclInputVar("mul_x", {1, 1, 1, 1}); + validator.DeclParamVar("mul_y", {1, 2}); + validator.DeclOutputVar("mul_out", {1, 1, 1, 2}); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("mul"); + desc.SetInput("X", {"mul_x"}); + desc.SetInput("Y", {"mul_y"}); + desc.SetOutput("Out", {"mul_out"}); + int num_flatten_dims = 3; + desc.SetAttr("x_num_col_dims", num_flatten_dims); + validator.SetOp(*desc.Proto()); + + validator.Execute(10); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +USE_OP(mul); diff --git a/paddle/fluid/inference/anakin/convert/ut_helper.h b/paddle/fluid/inference/anakin/convert/ut_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..38d8e596a738ac98c9f9870473f72dcc72b0e7aa --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/ut_helper.h @@ -0,0 +1,202 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/inference/anakin/engine.h" +#include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/inference/utils/singleton.h" +#include "paddle/fluid/platform/enforce.h" + +using anakin::graph::GraphGlobalMem; +using anakin::AK_FLOAT; +using anakin::Precision; +using anakin::saber::NV; +using anakin::saber::X86; +using anakin::saber::Shape; +using anakin::PBlock; +using anakin::PTuple; + +namespace paddle { +namespace inference { +namespace anakin { + +/* + * Get a random float value between [low, high] + */ +float random(float low, float high) { + static std::random_device rd; + static std::mt19937 mt(rd()); + std::uniform_real_distribution dist(low, high); + return dist(mt); +} + +void RandomizeTensor(framework::LoDTensor* tensor, const platform::Place& place, + const platform::DeviceContext& ctx) { + auto dims = tensor->dims(); + size_t num_elements = analysis::AccuDims(dims, dims.size()); + PADDLE_ENFORCE_GT(num_elements, 0); + + platform::CPUPlace cpu_place; + framework::LoDTensor temp_tensor; + temp_tensor.Resize(dims); + auto* temp_data = temp_tensor.mutable_data(cpu_place); + + for (size_t i = 0; i < num_elements; i++) { + *(temp_data + i) = random(0., 1.); + } + + TensorCopySync(temp_tensor, place, tensor); +} + +/* + * Help to validate the correctness between Fluid Op and the corresponding + * anakin + * layer. + */ +class AnakinConvertValidation { + using AnakinNvEngineT = AnakinEngine; + + public: + AnakinConvertValidation() = delete; + + AnakinConvertValidation(const std::unordered_set& parameters, + const framework::Scope& scope) + : parameters_(parameters), scope_(scope), place_(0) { + PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); + engine_.reset(new AnakinEngine(true)); + } + + // Declare a Variable as input with random initialization. + void DeclInputVar(const std::string& name, + const std::vector tensor_dims) { + DeclVar(name, tensor_dims); + // should decalre anakin input here. + } + + void DeclParamVar(const std::string& name, const std::vector dim_vec) { + DeclVar(name, dim_vec); + } + + void DeclOutputVar(const std::string& name, const std::vector dim_vec) { + DeclVar(name, dim_vec); + // should declare anakin output here. + } + + void DeclVar(const std::string& name, const std::vector dim_vec) { + platform::CUDADeviceContext ctx(place_); + auto* x = scope_.Var(name); + auto* x_tensor = x->GetMutable(); + x_tensor->Resize(framework::make_ddim(dim_vec)); + RandomizeTensor(x_tensor, place_, ctx); + } + + void SetOp(const framework::proto::OpDesc& desc) { + op_ = framework::OpRegistry::CreateOp(desc); + op_desc_.reset(new framework::OpDesc(desc, nullptr)); + // should init anakin engine here. + + Singleton::Global().ConvertOp( + desc, parameters_, scope_, engine_.get(), true /*test_mode*/); + engine_->Freeze(); + for (const auto& input : op_desc_->InputArgumentNames()) { + if (parameters_.count(input)) continue; + auto& t = inference::analysis::GetFromScope(scope_, + input); + auto t_shape = framework::vectorize2int(t.dims()); + engine_->SetInputShape(input, t_shape); + } + engine_->Optimize(); + engine_->InitGraph(); + } + + // We use the set 'neglected_output' here, because some Ops like batch norm, + // the outputs specified in the op des are only used during training, + // so we should neglect those output during inference. + void Execute(int batch_size, + std::unordered_set neglected_output = {}) { + // Execute Fluid Op + platform::CUDADeviceContext ctx(place_); + op_->Run(scope_, place_); + + // std::vector input_vector; + // std::vector output_vector; + std::map inputs; + for (const auto& input : op_desc_->InputArgumentNames()) { + if (parameters_.count(input)) continue; + auto* var = scope_.FindVar(input); + auto tensor = var->GetMutable(); + inputs.insert({input, tensor}); + } + + std::map outputs; + std::vector> fluid_outputs; + for (const auto& output : op_desc_->OutputArgumentNames()) { + if (neglected_output.count(output)) continue; + std::vector fluid_out; + auto* var = scope_.FindVar(output); + auto tensor = var->GetMutable(); + framework::TensorToVector(*tensor, ctx, &fluid_out); + fluid_outputs.push_back(fluid_out); + + // size_t fluid_out_size = fluid_out.size(); + /*for (size_t i = 0; i < fluid_out_size; i++) { + std::cout << fluid_out[i] << std::endl; + }*/ + outputs.insert({output, tensor}); + } + + engine_->Execute(inputs, outputs); + int i_output = 0; + for (const auto& output : op_desc_->OutputArgumentNames()) { + if (neglected_output.count(output)) continue; + std::vector anakin_out; + auto* var = scope_.FindVar(output); + auto tensor = var->GetMutable(); + framework::TensorToVector(*tensor, ctx, &anakin_out); + + size_t anakin_out_size = anakin_out.size(); + auto fluid_out = fluid_outputs[i_output++]; + for (size_t i = 0; i < anakin_out_size; i++) { + LOG(INFO) << "Output[" << i << "]: anakin[" << anakin_out[i] << "], " + << "fluid[" << fluid_out[i] << "]"; + } + } + } + + framework::Scope& scope() { return scope_; } + + private: + std::unique_ptr engine_{nullptr}; + cudaStream_t stream_; + std::unique_ptr op_; + std::unique_ptr op_desc_; + const std::unordered_set& parameters_; + framework::Scope& scope_; + platform::CUDAPlace place_; +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/engine.cc b/paddle/fluid/inference/anakin/engine.cc new file mode 100644 index 0000000000000000000000000000000000000000..6549991474f4834f0c3ef74c60d294cca6bebc91 --- /dev/null +++ b/paddle/fluid/inference/anakin/engine.cc @@ -0,0 +1,112 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/engine.h" +#include +#include +#include +#include +#include "paddle/fluid/framework/ddim.h" + +using anakin::Precision; +using anakin::OpRunType; +using paddle::framework::LoDTensor; +template +using AnakinNetT = anakin::Net; + +template +using AnakinGraphT = anakin::graph::Graph; + +namespace paddle { +namespace inference { +namespace anakin { + +template +AnakinEngine::AnakinEngine(bool need_summary) + : graph_(new AnakinGraphT()), + net_(new AnakinNetT(need_summary)) {} + +template +AnakinEngine::~AnakinEngine() {} + +template +void AnakinEngine::SetInputShape( + const std::string &name, std::vector shape) { + graph_->AddOpAttr<::anakin::PTuple>(name, "input_shape", + std::move(shape)); +} + +template +void AnakinEngine::InitGraph() { + net_->init(*graph_); +} + +template +void AnakinEngine::AddOp( + const std::string &name, const std::string &type, + const std::vector &inputs, + const std::vector &outputs) { + PADDLE_ENFORCE(graph_->AddOp(name, type, inputs, outputs), "Add operation."); +} + +template +void AnakinEngine::Execute( + const std::map &inputs, + const std::map &outputs) { + for (const auto &input : inputs) { + auto *tensor = input.second; + auto *data = tensor->data(); + auto shape = framework::vectorize2int(tensor->dims()); + ::anakin::saber::Shape anakin_shape(shape); + auto *anakin_input = net_->get_in(input.first); + ::anakin::saber::Tensor tmp_anakin_tensor(data, TargetT(), 0, + anakin_shape); + anakin_input->share_from(tmp_anakin_tensor); + } + + for (const auto &output : outputs) { + auto *tensor = output.second; + auto *data = tensor->data(); + auto shape = framework::vectorize2int(tensor->dims()); + ::anakin::saber::Shape anakin_shape(shape); + auto *anakin_output = net_->get_out(output.first); + ::anakin::saber::Tensor tmp_anakin_tensor(data, TargetT(), 0, + anakin_shape); + anakin_output->share_from(tmp_anakin_tensor); + } + net_->prediction(); +} + +template +void AnakinEngine::Freeze() { + PADDLE_ENFORCE(graph_->Freeze(), "Freeze anakin subgraph."); +} + +template +void AnakinEngine::Optimize() { + PADDLE_ENFORCE(graph_->Optimize(), "Graph optimization."); +} + +template +std::unique_ptr> +AnakinEngine::Clone() { + auto *engine = new AnakinEngine(); + engine->net_ = std::move(net_->Clone()); + return std::unique_ptr(engine); +} + +template class AnakinEngine<::anakin::saber::NV, ::anakin::Precision::FP32>; +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/engine.h b/paddle/fluid/inference/anakin/engine.h new file mode 100644 index 0000000000000000000000000000000000000000..d8f32f57be5aabb91ba720c6457a03f15083db43 --- /dev/null +++ b/paddle/fluid/inference/anakin/engine.h @@ -0,0 +1,80 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/inference/engine.h" +#include "paddle/fluid/inference/utils/singleton.h" + +#include "framework/core/net/net.h" +#include "framework/core/types.h" +#include "framework/graph/graph.h" +#include "saber/saber_types.h" + +namespace anakin { + +template +class Net; + +namespace graph { +template +class Graph; +} // namespace graph +} // namespace anakin + +namespace paddle { +namespace inference { +namespace anakin { + +template +class AnakinEngine { + public: + explicit AnakinEngine(bool need_summary = false); + ~AnakinEngine(); + void InitGraph(); + void SetInputShape(const std::string &name, std::vector shape); + void AddOp(const std::string &name, const std::string &type, + const std::vector &inputs, + const std::vector &outputs); + + template + void AddOpAttr(const std::string &op_name, const std::string &attr_name, + const T &attr_value) { + PADDLE_ENFORCE(graph_->AddOpAttr(op_name, attr_name, attr_value), + "Add operation's attribution."); + } + + std::unique_ptr Clone(); + void Freeze(); + void Optimize(); + void Execute(const std::map &inputs, + const std::map &outputs); + + private: + using NetT = ::anakin::Net; + using GraphT = ::anakin::graph::Graph; + std::unique_ptr graph_; + std::unique_ptr net_; +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/test_anakin_engine.cc b/paddle/fluid/inference/anakin/test_anakin_engine.cc new file mode 100644 index 0000000000000000000000000000000000000000..571294d3e22fb9489686bfcb2f3a64198099f970 --- /dev/null +++ b/paddle/fluid/inference/anakin/test_anakin_engine.cc @@ -0,0 +1,96 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include + +#include "framework/core/net/net.h" +#include "framework/graph/graph.h" +#include "framework/graph/graph_global_mem.h" +#include "paddle/fluid/inference/anakin/engine.h" + +using anakin::graph::GraphGlobalMem; +using anakin::AK_FLOAT; +using anakin::Precision; +using anakin::saber::NV; +using anakin::saber::X86; +using anakin::saber::Shape; +using anakin::PBlock; +using anakin::PTuple; +namespace paddle { +namespace inference { +namespace anakin { + +class TestAnakinEngine : public ::testing::Test { + protected: + void SetUp() override; + void TearDown() override {} + + protected: + using AnakinNvEngineT = AnakinEngine; + std::unique_ptr engine_{nullptr}; +}; + +void TestAnakinEngine::SetUp() { + engine_.reset(new AnakinEngine(true)); +} + +TEST_F(TestAnakinEngine, Execute) { + engine_->AddOp("op1", "Dense", {"x"}, {"y"}); + engine_->AddOpAttr("op1", "out_dim", 2); + engine_->AddOpAttr("op1", "bias_term", false); + engine_->AddOpAttr("op1", "axis", 1); + std::vector shape = {1, 1, 1, 2}; + Shape tmp_shape(shape); + // PBlock weight1(tmp_shape); + auto *weight1 = + GraphGlobalMem::Global().template new_block(tmp_shape); + // auto *weight1 = new PBlock(tmp_shape, AK_FLOAT); + + float *cpu_data = static_cast(weight1->h_tensor().mutable_data()); + cpu_data[0] = 2.; + weight1->d_tensor().set_shape(tmp_shape); + weight1->d_tensor().copy_from(weight1->h_tensor()); + engine_->AddOpAttr("op1", "weight_1", *weight1); + + engine_->Freeze(); + // PTuple input_shape = {1}; + // engine_->AddOpAttr("x", "input_shape", input_shape); + engine_->SetInputShape("x", {1, 1, 1, 1}); + engine_->Optimize(); + engine_->InitGraph(); + framework::LoDTensor x; + framework::LoDTensor y; + x.Resize({1, 1, 1, 1}); + y.Resize({1, 1, 1, 2}); + auto *x_data = x.mutable_data(platform::CUDAPlace()); + float x_data_cpu[] = {1.}; + cudaMemcpy(x_data, x_data_cpu, sizeof(float), cudaMemcpyHostToDevice); + + std::map inputs = {{"x", &x}}; + auto *y_data = y.mutable_data(platform::CUDAPlace()); + std::map outputs = {{"y", &y}}; + + engine_->Execute(inputs, outputs); + auto *y_data_gpu = y_data; + float y_data_cpu[2]; + cudaMemcpy(y_data_cpu, y_data_gpu, sizeof(float) * 2, cudaMemcpyDeviceToHost); + LOG(INFO) << "output value: " << y_data_cpu[0] << ", " << y_data_cpu[1]; +} + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 2f31b182af7293488719e41a92b2ea78709bda02..89e934ae27b9319d4e1d2d51586d5f8fa7dccfce 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -23,8 +23,12 @@ #pragma once +#include #include +#include +#include #include + #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" @@ -133,6 +137,8 @@ struct Argument { DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int); DECL_ARGUMENT_FIELD(tensorrt_precision_mode, TensorRtPrecisionMode, AnalysisConfig::Precision); + DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, TensorRtUseStaticEngine, + bool); // Memory optimized related. DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool); diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index 59107f28080dceb0a58e17d42281db5f3773de56..a48058400241b030f17557156a4d973fca92fd8d 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -17,10 +17,12 @@ limitations under the License. */ #include #include #include +#include #include #include #include #include +#include #include #include "paddle/fluid/framework/framework.pb.h" @@ -217,6 +219,35 @@ static std::string GetTrtCalibTableData(const std::string &model_opt_cache_dir, return ""; } +static std::string GetTrtEngineSerializedPath(const std::string &model_root, + const std::string &engine_key) { + return model_root + "/trt_serialized_" + engine_key; +} + +static std::string GetTrtEngineSerializedData( + const std::string &model_opt_cache_dir, const std::string &engine_key) { + std::string trt_serialized_path = + GetTrtEngineSerializedPath(model_opt_cache_dir, engine_key); + if (FileExists(trt_serialized_path)) { + VLOG(3) << "Trt serialized file: " << trt_serialized_path + << "is found here"; + std::ifstream infile(trt_serialized_path, std::ios::in); + std::stringstream buffer; + buffer << infile.rdbuf(); + std::string trt_engine_serialized_data(buffer.str()); + return trt_engine_serialized_data; + } + return ""; +} + +static void SaveTrtEngineSerializedDataToFile( + const std::string &trt_serialized_path, + const std::string &engine_serialized_data) { + std::ofstream outfile(trt_serialized_path); + outfile << engine_serialized_data; + outfile.close(); +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 8d5ee36ae627deccd7ddbd4bf8c5354a82c5e9db..1cdb4881fbc1e2c0249430f7148bf56261bd6c41 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -81,6 +81,9 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set( "model_opt_cache_dir", new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir))); + pass->Set("gpu_device_id", new int(argument->gpu_device_id())); + pass->Set("use_static_engine", + new bool(argument->tensorrt_use_static_engine())); } pre_pass = pass_name; diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.h b/paddle/fluid/inference/analysis/ir_pass_manager.h index 2a595cb36b8345157b3fd26afc62aabfa98b87bc..2d120679eedd392d78b4da66276297ff7280792b 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.h +++ b/paddle/fluid/inference/analysis/ir_pass_manager.h @@ -22,7 +22,10 @@ #pragma once +#include #include +#include +#include #include #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/pass.h" diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc index a64f85ee9ac1a7bb8f0ed7bb8678166bbbcd5746..96befe7f8a5d16402338ac337daa96d714b4d310 100644 --- a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc @@ -460,77 +460,6 @@ inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) { return node.inputs.size() == n; } -NodesTSIterator::NodesTSIterator(const std::vector &source) { - PADDLE_ENFORCE(!source.empty(), - "Start points of topological sorting should not be empty!"); - // CHECK all the inputs' in-degree is 0 - for (auto *node : source) { - PADDLE_ENFORCE(CheckNodeIndegreeEquals(*node, 0)); - } - - std::unordered_set visited; - std::unordered_set to_visit{source.begin(), source.end()}; - - std::vector inlink_visited; - while (!to_visit.empty()) { - std::vector queue(to_visit.begin(), to_visit.end()); - for (auto *p : queue) { - if (Agent(p).deleted()) { - visited.insert(p); - to_visit.erase(p); - } - - inlink_visited.clear(); - - std::copy_if(p->inputs.begin(), p->inputs.end(), - std::back_inserter(inlink_visited), - [&](Node *x) -> bool { return visited.count(x) != 0; }); - - if (inlink_visited.size() == p->inputs.size()) { - sorted_.push_back(p); - for (auto *_ : p->outputs) { - if (!visited.count(_)) { - to_visit.insert(_); - } - } - - to_visit.erase(p); - visited.insert(p); - } - } - } -} - -NodesTSIterator::NodesTSIterator(const NodesTSIterator &other) - : sorted_(other.sorted_), cursor_(other.cursor_) {} - -Node &NodesTSIterator::operator*() { - PADDLE_ENFORCE_LT(cursor_, sorted_.size()); - return *sorted_[cursor_]; -} - -NodesTSIterator &NodesTSIterator::operator++() { - if (++cursor_ >= sorted_.size()) { - sorted_.clear(); - cursor_ = 0; - } - return *this; -} -NodesTSIterator &NodesTSIterator::operator=(const NodesTSIterator &other) { - cursor_ = other.cursor_; - sorted_ = other.sorted_; - return *this; -} - -bool NodesTSIterator::operator==(const NodesTSIterator &other) { - return sorted_ == other.sorted_ && cursor_ == other.cursor_; -} - -Node *NodesTSIterator::operator->() { - PADDLE_ENFORCE_LT(cursor_, sorted_.size()); - return sorted_[cursor_]; -} - } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h index ea88edd042aa9d46f66af1aa92f2cb273696c118..5d11c217b69f11d45c6fb6d552dc404fa8313daf 100644 --- a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h @@ -30,6 +30,7 @@ namespace inference { namespace analysis { using framework::ir::Graph; +using framework::ir::NodesTSIterator; const char kIsFunctionNode[] = "__is_function_node__"; const char kFunctionNodeSubGraph[] = "__function_node_sub_graph__"; @@ -132,32 +133,6 @@ struct Agent { framework::ir::Node *x_; }; -// Topological sorting iterator on nodes. -struct NodesTSIterator - : public std::iterator { - NodesTSIterator() = default; - explicit NodesTSIterator(const std::vector &source); - NodesTSIterator(NodesTSIterator &&other) - : sorted_(std::move(other.sorted_)), cursor_(other.cursor_) { - other.cursor_ = 0; - } - NodesTSIterator(const NodesTSIterator &other); - - framework::ir::Node &operator*(); - NodesTSIterator &operator++(); - // TODO(Superjomn) current implementation just compare the first - // element, need to compare the graph and all the elements in the queue and - // set. - NodesTSIterator &operator=(const NodesTSIterator &other); - bool operator==(const NodesTSIterator &other); - bool operator!=(const NodesTSIterator &other) { return !(*this == other); } - framework::ir::Node *operator->(); - - private: - std::vector sorted_; - size_t cursor_{0}; -}; - // The nodes those have no input will be treated as start points. static std::vector ExtractStartPoints(const Graph &g) { std::vector result; diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 69a9caec030600332c9f11ba255e4e642bd41e96..d4e2da8957f2057b21460d00b71e9717c63ed054 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -14,13 +14,13 @@ #include #include -#include -#include #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h" #include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/op_teller.h" #include "paddle/fluid/string/pretty_log.h" @@ -33,8 +33,15 @@ using framework::ir::Node; std::vector ExtractParameters( const std::unordered_set &nodes); -std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl( +void RenameAndGetOutputs( + const std::vector &subgraph_nodes, + framework::BlockDesc *block_desc, + const std::set &input_names_with_id, + std::set *output_names_with_id, + std::set *output_names, + std::unordered_map *output_name_map); +std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl( std::unique_ptr graph) const { framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get()); @@ -47,9 +54,16 @@ std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl( Get("min_subgraph_size") /*min subgraph size*/); fuser(); + std::vector graph_param_names = + ExtractParameters(graph->Nodes()); + // those parameter already exist in trt, and should not have another copy in + // fluid. + std::vector repetitive_params; + for (auto *node : graph->Nodes()) { if (node->IsOp() && !Agent(node).subgraph()->empty()) { - CreateTensorRTOp(node, graph.get()); + CreateTensorRTOp(node, graph.get(), graph_param_names, + &repetitive_params); std::unordered_set nodes2remove( Agent(node).subgraph()->begin(), Agent(node).subgraph()->end()); @@ -64,12 +78,15 @@ std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl( } } framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove); + graph->Set(framework::ir::kRepetitiveParamAttr, + new std::vector(repetitive_params)); return graph; } std::string GenerateEngineKey(const std::set &engine_inputs, - const std::set &engine_outputs) { + const std::set &engine_outputs, + const std::string &predictor_id) { std::string engine_hash_key = ""; for (auto name : engine_inputs) { engine_hash_key += name; @@ -77,12 +94,15 @@ std::string GenerateEngineKey(const std::set &engine_inputs, for (auto name : engine_outputs) { engine_hash_key += name; } + engine_hash_key += predictor_id; auto engine_key = std::to_string(std::hash()(engine_hash_key)); return engine_key; } -void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, - Graph *graph) const { +void TensorRtSubgraphPass::CreateTensorRTOp( + framework::ir::Node *node, Graph *graph, + const std::vector &graph_params, + std::vector *repetitive_params) const { auto *op_desc = node->Op(); auto &subgraph = *Agent(node).subgraph(); PADDLE_ENFORCE(!subgraph.empty()); @@ -116,12 +136,16 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, // is unique. std::set input_names; std::set input_names_with_id; + std::vector params; + + // The node->inputs containes input tensors and parameters. for (auto *x : node->inputs) { input_names.insert(x->Name()); input_names_with_id.insert(x->Name() + std::to_string(x->id())); + if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) { + params.push_back(x->Name()); + } } - op_desc->SetInput( - "Xs", std::vector(input_names.begin(), input_names.end())); std::set output_names; std::set output_names_with_id; @@ -130,11 +154,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, output_names_with_id.insert(x->Name() + std::to_string(x->id())); } - op_desc->SetOutput( - "Ys", std::vector(output_names.begin(), output_names.end())); - op_desc->SetType("tensorrt_engine"); - std::unordered_map output_name_map; + auto &subgraph_nodes = *Agent(node).subgraph(); // The following procedure is used to rename all the intermediate // variables and the output variables of the subgraph. @@ -148,61 +169,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, // input of a OP, but also the output of a Op, there will be problems. // So we have to rename the variable in the subgraph to make sure // it is either an OP's input or an OP's output. - - auto &subgraph_nodes = *Agent(node).subgraph(); - for (size_t index = 0; index < block_desc.OpSize(); ++index) { - framework::proto::OpDesc *op = block_desc.Op(index)->Proto(); - auto correspond_node = subgraph_nodes[index]; - PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type()); - - std::unordered_map var2id; - for (auto *in_var : correspond_node->inputs) { - var2id[in_var->Name()] = in_var->id(); - } - // rename for the input variables of op inside subgraph - for (int i = 0; i < op->inputs_size(); i++) { - // one input - auto *in_var = op->mutable_inputs(i); - std::vector replaced_names; - for (int k = 0; k < in_var->arguments_size(); k++) { // all the arguments - std::string arg_value = in_var->arguments(k); - std::string arg_value_with_id = - arg_value + std::to_string(var2id[arg_value]); - if (input_names_with_id.count(arg_value_with_id)) { - replaced_names.push_back(arg_value); - } else { - replaced_names.push_back(arg_value_with_id); - } - } - in_var->clear_arguments(); - for (size_t k = 0; k < replaced_names.size(); k++) { - in_var->add_arguments(replaced_names[k]); - } - } - var2id.clear(); - for (auto out_var : correspond_node->outputs) { - var2id[out_var->Name()] = out_var->id(); - } - - // rename for the output variables of op inside subgraph - for (int i = 0; i < op->outputs_size(); i++) { - framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i); - std::vector replaced_names; - for (int k = 0; k < out_var->arguments_size(); k++) { - std::string arg_value = out_var->arguments(k); - std::string arg_value_with_id = - arg_value + std::to_string(var2id[arg_value]); - if (output_names_with_id.count(arg_value_with_id)) { - output_name_map[arg_value] = arg_value_with_id; - } - replaced_names.push_back(arg_value_with_id); - } - out_var->clear_arguments(); - for (size_t k = 0; k < replaced_names.size(); k++) { - out_var->add_arguments(replaced_names[k]); - } - } - } + RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id, + &output_names_with_id, &output_names, &output_name_map); // When tensorrt engine runs at the end of the operation, // output_mapping help us copy the data from the renamed ITensor @@ -212,6 +180,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, PADDLE_ENFORCE(output_name_map.count(name) != 0); output_mapping.push_back(output_name_map[name]); } + PADDLE_ENFORCE(!output_mapping.empty()); auto *vars = block_desc.Proto()->mutable_vars(); for (framework::ir::Node *node : graph->Nodes()) { @@ -222,26 +191,83 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(), "the block has no var-desc"); - PADDLE_ENFORCE(!output_mapping.empty()); + + // Set attrs + op_desc->SetType("tensorrt_engine"); + op_desc->SetInput( + "Xs", std::vector(input_names.begin(), input_names.end())); + + op_desc->SetOutput( + "Ys", std::vector(output_names.begin(), output_names.end())); + op_desc->SetBlockAttr("sub_block", new_block); SetAttr(op_desc->Proto(), "subgraph", block_desc.Proto()->SerializeAsString()); - // Set attrs SetAttr(op_desc->Proto(), "max_batch_size", Get("max_batch_size")); SetAttr(op_desc->Proto(), "workspace_size", Get("workspace_size")); - SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes())); SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping); + SetAttr(op_desc->Proto(), "parameters", params); auto enable_int8 = Get("enable_int8"); - auto engine_key = - GenerateEngineKey(input_names_with_id, output_names_with_id); + auto engine_key = GenerateEngineKey(input_names_with_id, output_names_with_id, + std::to_string(0)); + // Get "" when there is no cached calibration table data. std::string calibration_data = GetTrtCalibTableData( Get("model_opt_cache_dir"), engine_key, enable_int8); SetAttr(op_desc->Proto(), "calibration_data", calibration_data); SetAttr(op_desc->Proto(), "enable_int8", enable_int8); SetAttr(op_desc->Proto(), "engine_key", engine_key); + SetAttr(op_desc->Proto(), "engine_serialized_data", std::string("")); + + std::unique_ptr calibrator; + if (enable_int8 && calibration_data.size() != 0) { + calibrator.reset(new tensorrt::TRTInt8Calibrator(calibration_data)); + } + + bool use_static_engine = Get("use_static_engine"); + // When in int8 mode and calibration_mode, the program just produce the + // calibration table data. + bool calibration_mode = (enable_int8 && calibration_data.size() == 0); + if (!calibration_mode && use_static_engine) { + std::copy(params.begin(), params.end(), + std::back_inserter(*repetitive_params)); + std::string trt_engine_serialized_data = GetTrtEngineSerializedData( + Get("model_opt_cache_dir"), engine_key); + + if (trt_engine_serialized_data.empty()) { + LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP " + "kernel etc). This process may cost a lot of time."; + std::unique_ptr trt_engine( + new tensorrt::TensorRTEngine( + Get("max_batch_size"), Get("workspace_size"), + enable_int8, calibrator.get(), Get("gpu_device_id"))); + auto *scope = param_scope(); + framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto()); + std::unordered_set param_set(params.begin(), params.end()); + inference::Singleton::Global() + .ConvertBlockToTRTEngine( + &block_desc_temp, *scope, + std::vector(input_names.begin(), input_names.end()), + param_set, output_mapping, trt_engine.get()); + nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize(); + trt_engine_serialized_data = + std::string((const char *)serialized_engine_data->data(), + serialized_engine_data->size()); + SaveTrtEngineSerializedDataToFile( + GetTrtEngineSerializedPath(Get("model_opt_cache_dir"), + engine_key), + trt_engine_serialized_data); + } else { + LOG(INFO) << "Load TRT Optimized Info from " + << GetTrtEngineSerializedPath( + Get("model_opt_cache_dir"), engine_key); + } + + SetAttr(op_desc->Proto(), "engine_serialized_data", + trt_engine_serialized_data); + } } std::vector ExtractParameters( @@ -253,7 +279,7 @@ std::vector ExtractParameters( for (const auto &node : nodes) { if (!node->IsOp()) continue; std::string op_type = node->Op()->Type(); - if (op_type == "feed") { + if (op_type == "feed" || op_type == "fetch") { std::vector output_names = node->Op()->OutputArgumentNames(); std::copy(output_names.begin(), output_names.end(), std::back_inserter(feed_outputs)); @@ -272,6 +298,99 @@ std::vector ExtractParameters( return parameters; } +void RenameAndGetOutputs( + const std::vector &subgraph_nodes, + framework::BlockDesc *block_desc, + const std::set &input_names_with_id, + std::set *output_names_with_id, + std::set *output_names, + std::unordered_map *output_name_map) { + //// In the normal case, the paddle-trt exists bug when runing the googlenet. + // When there are more than two convolutions of 1 * 1 with the same input, the + // paddle-tensorrt will do the merging optimization, which fuse those conv + // into one conv, and then trigger bug. So, We should use strategy to avoid + // this optimization for the time being. This bug will be fixed in the future. + std::unordered_map + same_hierarchy_conv2d_num_map; + + for (size_t index = 0; index < block_desc->OpSize(); ++index) { + framework::proto::OpDesc *op = block_desc->Op(index)->Proto(); + framework::OpDesc op_desc(*op, nullptr); + auto correspond_node = subgraph_nodes[index]; + PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type()); + + std::unordered_map var2id; + std::unordered_map in_vars; + for (auto *in_var : correspond_node->inputs) { + var2id[in_var->Name()] = in_var->id(); + in_vars[in_var->Name()] = in_var; + } + // rename for the input variables of op inside subgraph + for (int i = 0; i < op->inputs_size(); i++) { + // one input + auto *in_var = op->mutable_inputs(i); + std::vector replaced_names; + for (int k = 0; k < in_var->arguments_size(); k++) { // all the arguments + std::string arg_value = in_var->arguments(k); + std::string arg_value_with_id = + arg_value + std::to_string(var2id[arg_value]); + if (input_names_with_id.count(arg_value_with_id)) { + replaced_names.push_back(arg_value); + } else { + replaced_names.push_back(arg_value_with_id); + } + } + in_var->clear_arguments(); + for (size_t k = 0; k < replaced_names.size(); k++) { + in_var->add_arguments(replaced_names[k]); + } + } + var2id.clear(); + for (auto out_var : correspond_node->outputs) { + var2id[out_var->Name()] = out_var->id(); + } + + if (op_desc.Type() == "conv2d") { + auto input_var_name = op_desc.Input("Input").front(); + auto filter_var_name = op_desc.Input("Filter").front(); + auto out_var_name = op_desc.Output("Output").front(); + auto filter_shape = in_vars[filter_var_name]->Var()->GetShape(); + const std::vector strides = + boost::get>(op_desc.GetAttr("strides")); + const std::vector paddings = + boost::get>(op_desc.GetAttr("paddings")); + if (same_hierarchy_conv2d_num_map[input_var_name] > 0) { + (*output_names_with_id) + .insert(out_var_name + std::to_string(var2id[out_var_name])); + (*output_names).insert(out_var_name); + } else if (filter_shape[2] == 1 && filter_shape[3] == 1 && + strides[0] == 1 && strides[1] == 1 && paddings[0] == 0 && + paddings[1] == 0) { + same_hierarchy_conv2d_num_map[input_var_name] += 1; + } + } + + // rename for the output variables of op inside subgraph + for (int i = 0; i < op->outputs_size(); i++) { + framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i); + std::vector replaced_names; + for (int k = 0; k < out_var->arguments_size(); k++) { + std::string arg_value = out_var->arguments(k); + std::string arg_value_with_id = + arg_value + std::to_string(var2id[arg_value]); + if (output_names_with_id->count(arg_value_with_id)) { + (*output_name_map)[arg_value] = arg_value_with_id; + } + replaced_names.push_back(arg_value_with_id); + } + out_var->clear_arguments(); + for (size_t k = 0; k < replaced_names.size(); k++) { + out_var->add_arguments(replaced_names[k]); + } + } + } +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h index 502353b95fc15e763900a0caf1649257508f0880..6689a668fc9313df4105875477424f1426637226 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h @@ -13,7 +13,12 @@ // limitations under the License. #pragma once -#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/pass.h" namespace paddle { @@ -26,8 +31,9 @@ class TensorRtSubgraphPass : public framework::ir::FusePassBase { std::unique_ptr graph) const override; private: - void CreateTensorRTOp(framework::ir::Node *x, - framework::ir::Graph *graph) const; + void CreateTensorRTOp(framework::ir::Node *x, framework::ir::Graph *graph, + const std::vector &graph_params, + std::vector *repetitive_params) const; void CleanIntermediateOutputs(framework::ir::Node *node); }; diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc index 8be2d3ac0b105e50fe619a720929dedaacb75537..d13ec7608c3e8075c1ef62fd4d47fbeee06e9005 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc @@ -31,6 +31,13 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { // The parameters are on the cpu, therefore, synchronization is not necessary. if (!argument->use_gpu()) return; + auto &graph = argument->main_graph(); + std::vector repetitive_params; + + if (graph.Has(framework::ir::kRepetitiveParamAttr)) + repetitive_params = graph.Get>( + framework::ir::kRepetitiveParamAttr); + LOG(INFO) << "Sync params from CPU to GPU"; PADDLE_ENFORCE(argument->gpu_device_id_valid()); @@ -43,6 +50,10 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { // Because there exists the case that new parameter variables are not added to // the program in the analysis pass. for (auto &var_name : all_vars) { + if (std::count(repetitive_params.begin(), repetitive_params.end(), + var_name)) { + continue; + } auto *var = scope->FindLocalVar(var_name); PADDLE_ENFORCE(var != nullptr); if (var->IsType() || diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h index a95f460df6f9636fc17a5cf76920f5f459385120..61990150a30db147418c4301359428cf3c6db541 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h @@ -17,6 +17,7 @@ #include #include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/analysis/analysis_pass.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index e92273b4dd94f11e0e90c91fd82dafe42bf158f3..77411112220dcb722d4d3482bc844720981a2da2 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -89,7 +89,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(params_file_); CP_MEMBER(model_from_memory_); // the memory model reuses prog_file_ and // params_file_ fields. - // Gpu releated. + // Gpu related. CP_MEMBER(use_gpu_); CP_MEMBER(device_id_); CP_MEMBER(memory_pool_init_size_mb_); @@ -97,13 +97,14 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(enable_memory_optim_); CP_MEMBER(static_memory_optim_); CP_MEMBER(static_memory_optim_force_update_); - // TensorRT releated. + // TensorRT related. CP_MEMBER(use_tensorrt_); CP_MEMBER(tensorrt_workspace_size_); CP_MEMBER(tensorrt_max_batchsize_); CP_MEMBER(tensorrt_min_subgraph_size_); CP_MEMBER(tensorrt_precision_mode_); - // MKLDNN releated. + CP_MEMBER(trt_use_static_engine_); + // MKLDNN related. CP_MEMBER(use_mkldnn_); CP_MEMBER(mkldnn_enabled_op_types_); @@ -144,7 +145,7 @@ void AnalysisConfig::EnableMKLDNN() { void AnalysisConfig::EnableTensorRtEngine( int workspace_size, int max_batch_size, int min_subgraph_size, - AnalysisConfig::Precision precision_mode) { + AnalysisConfig::Precision precision_mode, bool use_static) { #ifdef PADDLE_WITH_CUDA if (!use_gpu()) { LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first"; @@ -156,6 +157,7 @@ void AnalysisConfig::EnableTensorRtEngine( tensorrt_max_batchsize_ = max_batch_size; tensorrt_min_subgraph_size_ = min_subgraph_size; tensorrt_precision_mode_ = precision_mode; + trt_use_static_engine_ = use_static; Update(); #else diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 712e010db4340bde55945f89c488ba8cc38a1926..b58c60e96a0bd6695b827e7063fa7a07f42fe586 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -183,6 +183,9 @@ void AnalysisPredictor::SetMkldnnThreadID(int tid) { bool AnalysisPredictor::Run(const std::vector &inputs, std::vector *output_data, int batch_size) { + if (UNLIKELY(config_.cpu_math_library_num_threads() > 1)) { + paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); + } VLOG(3) << "Predictor::predict"; inference::Timer timer; timer.tic(); @@ -240,6 +243,8 @@ bool AnalysisPredictor::SetFeed(const std::vector &inputs, input_ptr = input.mutable_data(ddim, place_); } else if (inputs[i].dtype == PaddleDType::FLOAT32) { input_ptr = input.mutable_data(ddim, place_); + } else if (inputs[i].dtype == PaddleDType::INT32) { + input_ptr = input.mutable_data(ddim, place_); } else { LOG(ERROR) << "unsupported feed type " << inputs[i].dtype; return false; @@ -323,8 +328,11 @@ bool AnalysisPredictor::GetFetch(std::vector *outputs, } else if (type == framework::proto::VarType::INT64) { GetFetchOne(fetch, output); output->dtype = PaddleDType::INT64; + } else if (type == framework::proto::VarType::INT32) { + GetFetchOne(fetch, output); + output->dtype = PaddleDType::INT32; } else { - LOG(ERROR) << "unknown type, only support float32 and int64 now."; + LOG(ERROR) << "unknown type, only support float32, int64 and int32 now."; } } return true; @@ -362,6 +370,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_); argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_); argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_); + argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_); } if (config_.use_mkldnn_) { @@ -392,7 +401,7 @@ std::unique_ptr CreatePaddlePredictor< AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) { VLOG(3) << "create AnalysisConfig"; if (config.use_gpu()) { - // 1. GPU memeroy + // 1. GPU memory PADDLE_ENFORCE_GT(config.memory_pool_init_size_mb(), 0.f); PADDLE_ENFORCE_GE(config.gpu_device_id(), 0, "Invalid device id %d", config.gpu_device_id()); @@ -435,12 +444,14 @@ void AnalysisPredictor::PrepareFeedFetch() { } feeds_[idx] = op; feed_names_[op->Output("Out")[0]] = idx; + idx2feeds_[idx] = op->Output("Out")[0]; } else if (op->Type() == "fetch") { int idx = boost::get(op->GetAttr("col")); if (fetches_.size() <= static_cast(idx)) { fetches_.resize(idx + 1); } fetches_[idx] = op; + idx2fetches_[idx] = op->Input("X")[0]; } } } @@ -453,6 +464,22 @@ void AnalysisPredictor::CreateFeedFetchVar(framework::Scope *scope) { var->GetMutable(); } +std::vector AnalysisPredictor::GetInputNames() { + std::vector input_names; + for (auto &item : idx2feeds_) { + input_names.push_back(item.second); + } + return input_names; +} + +std::vector AnalysisPredictor::GetOutputNames() { + std::vector output_names; + for (auto &item : idx2fetches_) { + output_names.push_back(item.second); + } + return output_names; +} + std::unique_ptr AnalysisPredictor::GetInputTensor( const std::string &name) { PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name); @@ -460,6 +487,13 @@ std::unique_ptr AnalysisPredictor::GetInputTensor( new ZeroCopyTensor(static_cast(executor_->scope()))); res->input_or_output_ = true; res->SetName(name); + if (platform::is_cpu_place(place_)) { + res->SetPlace(PaddlePlace::kCPU); + } else { + auto gpu_place = boost::get(place_); + res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); + } + return res; } @@ -470,6 +504,12 @@ std::unique_ptr AnalysisPredictor::GetOutputTensor( new ZeroCopyTensor(static_cast(executor_->scope()))); res->input_or_output_ = false; res->SetName(name); + if (platform::is_cpu_place(place_)) { + res->SetPlace(PaddlePlace::kCPU); + } else { + auto gpu_place = boost::get(place_); + res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); + } return res; } @@ -726,7 +766,7 @@ bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() { return need; } -std::string AnalysisPredictor::GetSeriazlizedProgram() const { +std::string AnalysisPredictor::GetSerializedProgram() const { return inference_program_->Proto()->SerializeAsString(); } diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 014df4ee8b6d86232212736c43a9aff32ffee011..5c0535d63e00c32ef82aa6d804459542d7da3e50 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -15,12 +15,14 @@ #pragma once #include #include +#include #include #include #include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/details/reset_tensor_array.h" +#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/string/printf.h" #ifdef PADDLE_WITH_TESTING @@ -53,6 +55,9 @@ class AnalysisPredictor : public PaddlePredictor { std::vector *output_data, int batch_size = -1) override; + std::vector GetInputNames(); + std::vector GetOutputNames(); + std::unique_ptr GetInputTensor( const std::string &name) override; std::unique_ptr GetOutputTensor( @@ -74,7 +79,7 @@ class AnalysisPredictor : public PaddlePredictor { void SetMkldnnThreadID(int tid); - std::string GetSeriazlizedProgram() const override; + std::string GetSerializedProgram() const override; protected: // For memory optimization. @@ -131,7 +136,11 @@ class AnalysisPredictor : public PaddlePredictor { std::shared_ptr inference_program_; std::vector feeds_; std::map feed_names_; + // Sorted according to the idx. + std::map idx2feeds_; std::vector fetches_; + std::map idx2fetches_; + // Memory buffer for feed inputs. The temporary LoDTensor will cause serious // concurrency problems, wrong results and memory leak, so cache them. std::vector feed_tensors_; diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index 002ba90e40e69d565f5a54e374a3f0083b84273f..6696839b53fb21c274843afd86b5d8b5c2042c51 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -214,8 +214,8 @@ TEST(AnalysisPredictor, memory_optim) { { // The first predictor help to cache the memory optimize strategy. auto predictor = CreatePaddlePredictor(config); - LOG(INFO) << "serialized program: " << predictor->GetSeriazlizedProgram(); - ASSERT_FALSE(predictor->GetSeriazlizedProgram().empty()); + LOG(INFO) << "serialized program: " << predictor->GetSerializedProgram(); + ASSERT_FALSE(predictor->GetSerializedProgram().empty()); // Run several times to check the parameters are not reused by mistake. for (int i = 0; i < 5; i++) { diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc index 6cd18277d63200f5bccf180a7ae3196b0ce126ff..7d57b6ec74468dbdb0519f85140629a0ac01c18d 100644 --- a/paddle/fluid/inference/api/api.cc +++ b/paddle/fluid/inference/api/api.cc @@ -28,6 +28,8 @@ int PaddleDtypeSize(PaddleDType dtype) { return sizeof(float); case PaddleDType::INT64: return sizeof(int64_t); + case PaddleDType::INT32: + return sizeof(int32_t); default: assert(false); return -1; @@ -92,7 +94,7 @@ void PaddleBuf::Reset(void *data, size_t length) { void PaddleBuf::Free() { if (memory_owned_ && data_) { - PADDLE_ENFORCE_GT(length_, 0); + PADDLE_ENFORCE_GT(length_, 0UL); free(static_cast(data_)); data_ = nullptr; length_ = 0; diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index e18bc02d92eb517fa20dc83811694b8ac80ae316..54f40563c3662af24e794422be4d3262d86c76a7 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -131,6 +131,9 @@ NativePaddlePredictor::~NativePaddlePredictor() { bool NativePaddlePredictor::Run(const std::vector &inputs, std::vector *output_data, int batch_size) { + if (UNLIKELY(config_.cpu_math_library_num_threads() > 1)) { + paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); + } VLOG(3) << "Predictor::predict"; Timer timer; timer.tic(); @@ -200,6 +203,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, input_ptr = input.mutable_data(ddim, place_); } else if (inputs[i].dtype == PaddleDType::FLOAT32) { input_ptr = input.mutable_data(ddim, place_); + } else if (inputs[i].dtype == PaddleDType::INT32) { + input_ptr = input.mutable_data(ddim, place_); } else { LOG(ERROR) << "unsupported feed type " << inputs[i].dtype; return false; @@ -278,8 +283,11 @@ bool NativePaddlePredictor::GetFetch(std::vector *outputs, } else if (type == framework::DataTypeTrait::DataType) { GetFetchOne(fetch, output); output->dtype = PaddleDType::INT64; + } else if (type == framework::DataTypeTrait::DataType) { + GetFetchOne(fetch, output); + output->dtype = PaddleDType::INT32; } else { - LOG(ERROR) << "unknown type, only support float32 and int64 now."; + LOG(ERROR) << "unknown type, only support float32, int64 and int32 now."; } } return true; @@ -290,7 +298,7 @@ std::unique_ptr CreatePaddlePredictor< NativeConfig, PaddleEngineKind::kNative>(const NativeConfig &config) { VLOG(3) << "create NativePaddlePredictor"; if (config.use_gpu) { - // 1. GPU memeroy + // 1. GPU memory PADDLE_ENFORCE_GE( config.fraction_of_gpu_memory, 0.f, "fraction_of_gpu_memory in the config should be set to range (0., 1.]"); diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc index e82cb53bf073d3d1ab9a518218edaf430728463f..2dc5dda34d02c6df9c0ccbc47a1ac960e1aca3f5 100644 --- a/paddle/fluid/inference/api/api_impl_tester.cc +++ b/paddle/fluid/inference/api/api_impl_tester.cc @@ -42,6 +42,9 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) { } else if (t->type() == framework::proto::VarType::FP32) { pt.data.Reset(t->data(), t->numel() * sizeof(float)); pt.dtype = PaddleDType::FLOAT32; + } else if (t->type() == framework::proto::VarType::INT32) { + pt.data.Reset(t->data(), t->numel() * sizeof(int32_t)); + pt.dtype = PaddleDType::INT32; } else { LOG(FATAL) << "unsupported type."; } diff --git a/paddle/fluid/inference/api/demo_ci/utils.h b/paddle/fluid/inference/api/demo_ci/utils.h index d70c6aea791219a40c3164b51499f9d5e562be71..1505a898c5bba285b377203c1503b8615666b196 100644 --- a/paddle/fluid/inference/api/demo_ci/utils.h +++ b/paddle/fluid/inference/api/demo_ci/utils.h @@ -88,13 +88,20 @@ void CheckOutput(const std::string& referfile, const PaddleTensor& output) { } break; } - case PaddleDType::FLOAT32: + case PaddleDType::FLOAT32: { for (size_t i = 0; i < numel; ++i) { CHECK_LT( fabs(static_cast(output.data.data())[i] - refer.data[i]), 1e-5); } break; + } + case PaddleDType::INT32: { + for (size_t i = 0; i < numel; ++i) { + CHECK_EQ(static_cast(output.data.data())[i], refer.data[i]); + } + break; + } } } @@ -113,11 +120,18 @@ static std::string SummaryTensor(const PaddleTensor& tensor) { } break; } - case PaddleDType::FLOAT32: + case PaddleDType::FLOAT32: { for (int i = 0; i < std::min(num_elems, 10); i++) { ss << static_cast(tensor.data.data())[i] << " "; } break; + } + case PaddleDType::INT32: { + for (int i = 0; i < std::min(num_elems, 10); i++) { + ss << static_cast(tensor.data.data())[i] << " "; + } + break; + } } return ss.str(); } diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index f60ff40c5da3e9e03c2cb3583263394cb82db805..9a40cf4b60a64c3d0452a4367ccb7ac36de6b3b8 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -73,12 +74,72 @@ T *ZeroCopyTensor::data(PaddlePlace *place, int *size) const { return res; } +template +void ZeroCopyTensor::copy_from_cpu(const T *data) { + EAGER_GET_TENSOR; + PADDLE_ENFORCE_GE( + tensor->numel(), 0, + "You should call ZeroCopyTensor::Reshape(const std::vector &shape)" + "function before copy data from cpu."); + size_t ele_size = tensor->numel() * sizeof(T); + + if (place_ == PaddlePlace::kCPU) { + auto *t_data = tensor->mutable_data(platform::CPUPlace()); + std::memcpy(static_cast(t_data), data, ele_size); + } else { +#ifdef PADDLE_WITH_CUDA + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + platform::CUDAPlace gpu_place(device_); + auto *t_data = tensor->mutable_data(gpu_place); + auto *dev_ctx = + static_cast(pool.Get(gpu_place)); + + memory::Copy(gpu_place, static_cast(t_data), platform::CPUPlace(), + data, ele_size, dev_ctx->stream()); +#else + PADDLE_THROW("Not compile with CUDA, should not reach here."); +#endif + } +} + +template +void ZeroCopyTensor::copy_to_cpu(T *data) { + EAGER_GET_TENSOR; + auto ele_num = tensor->numel(); + auto *t_data = tensor->data(); + auto t_place = tensor->place(); + + if (platform::is_cpu_place(t_place)) { + std::memcpy(static_cast(data), t_data, ele_num * sizeof(T)); + } else { +#ifdef PADDLE_WITH_CUDA + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto gpu_place = boost::get(t_place); + auto *dev_ctx = + static_cast(pool.Get(gpu_place)); + memory::Copy(platform::CPUPlace(), static_cast(data), gpu_place, + t_data, ele_num * sizeof(T), dev_ctx->stream()); +#else + PADDLE_THROW("Not compile with CUDA, should not reach here."); +#endif + } +} +template void ZeroCopyTensor::copy_from_cpu(const float *data); +template void ZeroCopyTensor::copy_from_cpu(const int64_t *data); +template void ZeroCopyTensor::copy_from_cpu(const int32_t *data); +template void ZeroCopyTensor::copy_to_cpu(float *data); +template void ZeroCopyTensor::copy_to_cpu(int64_t *data); +template void ZeroCopyTensor::copy_to_cpu(int32_t *data); + template float *ZeroCopyTensor::data(PaddlePlace *place, int *size) const; template int64_t *ZeroCopyTensor::data(PaddlePlace *place, int *size) const; +template int32_t *ZeroCopyTensor::data(PaddlePlace *place, + int *size) const; template float *ZeroCopyTensor::mutable_data(PaddlePlace place); template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place); +template int32_t *ZeroCopyTensor::mutable_data(PaddlePlace place); void *ZeroCopyTensor::FindTensor() const { PADDLE_ENFORCE(!name_.empty(), @@ -92,10 +153,10 @@ void *ZeroCopyTensor::FindTensor() const { return tensor; } -std::vector ZeroCopyTensor::shape() const { +std::vector ZeroCopyTensor::shape() const { EAGER_GET_TENSOR; PADDLE_ENFORCE(tensor_, "not found tensor called %s in the scope", name_); - return framework::vectorize(tensor->dims()); + return framework::vectorize2int(tensor->dims()); } void ZeroCopyTensor::SetLoD(const std::vector> &x) { diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc index 12071e09f8442f2c52a06b7c3fe4bed2c28b524a..cbbb3ea2d1395acdf4c460bea4b7868c31a20e53 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc @@ -37,7 +37,7 @@ template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place); void *ZeroCopyTensor::FindTensor() const { return nullptr; } -std::vector ZeroCopyTensor::shape() const { return {}; } +std::vector ZeroCopyTensor::shape() const { return {}; } void ZeroCopyTensor::SetLoD(const std::vector> &x) {} diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index b92781e4f2c612cbb39fcaa7c80b6051a67215fd..258a79fa4e884177490fab79778151ae52537aa0 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -50,6 +50,11 @@ class Timer { } }; +static int GetUniqueId() { + static int id = 0; + return id++; +} + static void split(const std::string &str, char sep, std::vector *pieces) { pieces->clear(); @@ -81,6 +86,13 @@ static void split_to_int64(const std::string &str, char sep, std::transform(pieces.begin(), pieces.end(), std::back_inserter(*is), [](const std::string &v) { return std::stoi(v); }); } +static void split_to_int(const std::string &str, char sep, + std::vector *is) { + std::vector pieces; + split(str, sep, &pieces); + std::transform(pieces.begin(), pieces.end(), std::back_inserter(*is), + [](const std::string &v) { return std::stoi(v); }); +} template std::string to_string(const std::vector &vec) { std::stringstream ss; @@ -127,9 +139,8 @@ static void TensorAssignData(PaddleTensor *tensor, } template -static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor, - const std::vector> &data) { - int size{0}; +static void ZeroCopyTensorAssignData(ZeroCopyTensor *tensor, + const std::vector> &data) { auto *ptr = tensor->mutable_data(PaddlePlace::kCPU); int c = 0; for (const auto &f : data) { @@ -137,7 +148,15 @@ static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor, ptr[c++] = v; } } - return size; +} + +template +static void ZeroCopyTensorAssignData(ZeroCopyTensor *tensor, + const PaddleBuf &data) { + auto *ptr = tensor->mutable_data(PaddlePlace::kCPU); + for (size_t i = 0; i < data.length() / sizeof(T); i++) { + ptr[i] = *(reinterpret_cast(data.data()) + i); + } } static bool CompareTensor(const PaddleTensor &a, const PaddleTensor &b) { @@ -197,6 +216,9 @@ static std::string DescribeTensor(const PaddleTensor &tensor, case PaddleDType::INT64: os << "int64"; break; + case PaddleDType::INT32: + os << "int32"; + break; default: os << "unset"; } diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 47361b3279e14dd65a0e6e7f864e508ef1183045..9b05c335047d7f9a0c50004e4ff6817ddd53d80f 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -135,7 +135,8 @@ struct AnalysisConfig { */ void EnableTensorRtEngine(int workspace_size = 1 << 20, int max_batch_size = 1, int min_subgraph_size = 3, - Precision precision = Precision::kFloat32); + Precision precision = Precision::kFloat32, + bool use_static = true); /** A boolean state telling whether the TensorRT engine is used. */ bool tensorrt_engine_enabled() const { return use_tensorrt_; } @@ -212,12 +213,12 @@ struct AnalysisConfig { std::string prog_file_; std::string params_file_; - // GPU releated. + // GPU related. bool use_gpu_{false}; int device_id_{0}; uint64_t memory_pool_init_size_mb_{100}; // initial size is 100MB. - // TensorRT releated. + // TensorRT related. bool use_tensorrt_{false}; // For workspace_size, refer it from here: // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting @@ -233,6 +234,7 @@ struct AnalysisConfig { // subgraph, 3 as default value. int tensorrt_min_subgraph_size_{3}; Precision tensorrt_precision_mode_; + bool trt_use_static_engine_; // memory reuse related. bool enable_memory_optim_{false}; diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index f90a74b9102ee62d15c2d738b53971c9bde51439..703fd18069474f28b29c6f16c6308fc19bd3527f 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -36,6 +36,7 @@ namespace paddle { enum PaddleDType { FLOAT32, INT64, + INT32, // TODO(Superjomn) support more data types if needed. }; @@ -160,11 +161,21 @@ class ZeroCopyTensor { template T* data(PaddlePlace* place, int* size) const; - std::vector shape() const; + template + void copy_from_cpu(const T* data); + + template + void copy_to_cpu(T* data); + + std::vector shape() const; void SetLoD(const std::vector>& x); std::vector> lod() const; const std::string& name() const { return name_; } + void SetPlace(PaddlePlace place, int device = -1) { + place_ = place; + device_ = device; + } protected: explicit ZeroCopyTensor(void* scope) : scope_{scope} {} @@ -179,6 +190,8 @@ class ZeroCopyTensor { // The corresponding tensor pointer inside Paddle workspace is cached for // performance. mutable void* tensor_{nullptr}; + PaddlePlace place_; + int device_; }; /** A simple Inference API for Paddle. @@ -200,6 +213,14 @@ class PaddlePredictor { std::vector* output_data, int batch_size = -1) = 0; + /** \brief Get input names of the model + */ + virtual std::vector GetInputNames() { return {}; } + + /** \brief Get output names of the model + */ + virtual std::vector GetOutputNames() { return {}; } + /** \brief Get a mutable tensor directly. * * NOTE Only works in AnalysisPredictor. @@ -248,7 +269,7 @@ class PaddlePredictor { /** \brief Get the serialized model program that executes in inference phase. * Its data type is ProgramDesc, which is a protobuf message. */ - virtual std::string GetSeriazlizedProgram() const { + virtual std::string GetSerializedProgram() const { assert(false); // Force raise error. return "NotImplemented"; } diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index f9c13c2fa84b3b5d629297d3f44a6f5889a734f4..92c24647e87a096e7cfbbf69876b678fe48842a4 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/inference/api/paddle_pass_builder.h" - +#ifdef PADDLE_WITH_CUDA +#include +#endif #include namespace paddle { diff --git a/paddle/fluid/inference/engine.h b/paddle/fluid/inference/engine.h index ce2b8161715a3fa2278ce950dbac82c6d0042bef..1a13ba510384c010e476bf0ba0ad5b0ba84d3240 100644 --- a/paddle/fluid/inference/engine.h +++ b/paddle/fluid/inference/engine.h @@ -49,11 +49,6 @@ class EngineBase { // Execute the engine, that will run the inference network. virtual void Execute(int batch_size) = 0; - // Return the IO buffer that allocated in engine. One can read/write directly - // on the buffer. If the buffer's buffer is nullptr, one can also allocate - // memory and maintain it outside the engine. - virtual Buffer& buffer(const std::string& name) = 0; - virtual ~EngineBase() {} }; // class EngineBase diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index 7900f56c9ce17ffc7c62c85a42c62ba326dea16e..39a99a21ea702032669ed4ed3016ab34128c9925 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -18,21 +18,6 @@ namespace paddle { namespace inference { namespace tensorrt { -bool to_skip_merging_optimize(TensorRTEngine* engine, - const std::vector& filters, - const std::vector& strides, - const std::vector& paddings, - std::string input_name) { - if (engine->itensor_quote_num[input_name] > 0) { - return true; - } - if (filters[0] == 1 && filters[1] == 1 && strides[0] == 1 && - strides[1] == 1 && paddings[0] == 0 && paddings[1] == 0) - engine->itensor_quote_num[input_name] += 1; - - return false; -} - template void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode, @@ -59,7 +44,7 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, weight_tensor->Resize(Y_t->dims()); TensorCopySync((*Y_t), cpu_place, weight_tensor.get()); - auto* weight_data = weight_tensor->mutable_data(platform::CPUPlace()); + auto* weight_data = weight_tensor->mutable_data(cpu_place); PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL); const int n_output = weight_tensor->dims()[0]; @@ -100,9 +85,7 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, layer->getOutput(0)->setName(output_name.c_str()); engine->SetITensor(output_name, layer->getOutput(0)); - if (test_mode || - to_skip_merging_optimize(engine, {filter_h, filter_w}, strides, paddings, - op_desc.Input("Input").front())) { + if (test_mode) { engine->DeclareOutput(output_name); } } diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 79362f9677010247dffa4fbaa155a7a56eed6f85..0c5a1a6ef16f05308df22452ed5e184e94e117d2 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -153,7 +153,6 @@ class ElementwiseTensorOpConverter : public OpConverter { if (CheckDims(dims_x, dims_y)) { // The two input tensor should have the same dims VLOG(3) << "Convert a fluid elementwise op to TensorRT IElementWiseLayer"; - nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER( engine_, ElementWise, *const_cast(X), *const_cast(Y), op_pair->second); @@ -166,7 +165,7 @@ class ElementwiseTensorOpConverter : public OpConverter { "ElementWisePluginLayer"; plugin::ElementWisePlugin* plugin = - new plugin::ElementWisePlugin(op_pair->second, dims_x, dims_y, axis); + new plugin::ElementWisePlugin(op_type_, dims_x, dims_y, axis); plugin->AddInput(X); plugin->AddInput(Y); nvinfer1::IPluginLayer* layer = engine_->AddPlugin( diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index eef4fab4e86f05fa80bc614371f1aa43e433407e..42dcd68e40e04e775961fd943070f3df2f28d99a 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -85,10 +85,10 @@ class FcOpConverter : public OpConverter { Y_t->dims()[0] * Y_t->dims()[1] * sizeof(float)); TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, static_cast(weight_data), - Y_t->memory_size() / sizeof(float)}; + static_cast(Y_t->numel())}; TensorRTEngine::Weight tmp_weight(nvinfer1::DataType::kFLOAT, static_cast(tmp->data()), - Y_t->memory_size() / sizeof(float)); + static_cast(Y_t->numel())); weight.dims.assign({Y_t->dims()[0], Y_t->dims()[1]}); tmp_weight.dims = weight.dims; diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index 91670ba8ac5332fe6e83b7bff14cb1a349d7e2a2..90ed90b1e2907cc4be6f507890bae8df5a44ee38 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -16,9 +16,12 @@ limitations under the License. */ #include #include +#include +#include #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/utils/singleton.h" @@ -26,6 +29,37 @@ namespace paddle { namespace inference { namespace tensorrt { +using FluidDT = framework::proto::VarType_Type; +using TRT_DT = nvinfer1::DataType; + +namespace { // NOLINT + +TRT_DT FluidDataType2TRT(FluidDT type) { + switch (type) { + case FluidDT::VarType_Type_FP32: + return TRT_DT::kFLOAT; + case FluidDT::VarType_Type_INT32: + return TRT_DT::kINT32; + default: + return TRT_DT::kINT32; + } + PADDLE_THROW("unkown type"); + return TRT_DT::kINT32; +} + +nvinfer1::Dims Vec2TRT_Dims(const std::vector& shape) { + PADDLE_ENFORCE_GT(shape.size(), 1UL, + "TensorRT' tensor input requires at least 2 dimensions"); + PADDLE_ENFORCE_LE(shape.size(), 4UL, + "TensorRT' tensor input requires at most 4 dimensions"); + PADDLE_ENFORCE(shape.size() == 4UL || shape.size() == 2UL); + if (shape.size() == 4UL) + return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]); + return nvinfer1::DimsCHW(shape[1], 1, 1); +} + +} // namespace // NOLINT + /* * Convert Op from Fluid to TensorRT Engine. */ @@ -110,6 +144,34 @@ class OpConverter { } } + // The scope here should be inited with the parameter vars. + void ConvertBlockToTRTEngine( + framework::BlockDesc* block_desc, const framework::Scope& scope, + const std::vector& inputs, + const std::unordered_set& parameters, + const std::vector& outputs, TensorRTEngine* engine) { + engine->InitNetwork(); + for (auto& input : inputs) { + if (parameters.count(input)) continue; + auto* var = block_desc->FindVar(input); + PADDLE_ENFORCE(var, "no variable called %s", input); + PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR, + "TensorRT engine only takes LoDTensor as input"); + auto var_shape = var->GetShape(); + + engine->DeclareInput( + input, FluidDataType2TRT( + var->Proto()->type().lod_tensor().tensor().data_type()), + Vec2TRT_Dims(var_shape)); + } + framework::proto::BlockDesc* block_proto = block_desc->Proto(); + ConvertBlock(*block_proto, parameters, scope, engine); + for (auto& output : outputs) { + engine->DeclareOutput(output); + } + engine->FreezeNetwork(); + } + void SetEngine(TensorRTEngine* engine) { engine_ = engine; } virtual ~OpConverter() {} diff --git a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc index dbdff85ddebc85bc51938a204a48affe485b8240..2ae804106e5f7b51fc43e33cad986619e6a57d74 100644 --- a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc @@ -43,23 +43,20 @@ class PReluOpConverter : public OpConverter { PADDLE_ENFORCE_NOT_NULL(alpha_var); auto* alpha_tensor = alpha_var->GetMutable(); - platform::CUDAPlace place; - std::unique_ptr alpha_tensor_device( + platform::CPUPlace cpu_place; + std::unique_ptr alpha_tensor_temp( new framework::LoDTensor()); - alpha_tensor_device->Resize(alpha_tensor->dims()); - TensorCopySync(*alpha_tensor, place, alpha_tensor_device.get()); - float* alpha_data = alpha_tensor_device->mutable_data(place); + alpha_tensor_temp->Resize(alpha_tensor->dims()); + TensorCopySync(*alpha_tensor, cpu_place, alpha_tensor_temp.get()); + float* alpha_data = alpha_tensor_temp->mutable_data(cpu_place); - // Transform alpha to TensorRTEngine::Weight - TensorRTEngine::Weight alpha_rt(nvinfer1::DataType::kFLOAT, - static_cast(alpha_data), - alpha_tensor_device->numel()); - plugin::PReluPlugin* plugin = new plugin::PReluPlugin(alpha_rt, mode); + plugin::PReluPlugin* plugin = + new plugin::PReluPlugin(alpha_data, alpha_tensor_temp->numel(), mode); nvinfer1::IPluginLayer* layer = engine_->AddPlugin(&input, input_num, plugin); // keep alpha tensor to avoid release it's memory engine_->weight_map[op_desc.Input("Alpha")[0]] = - std::move(alpha_tensor_device); + std::move(alpha_tensor_temp); std::string layer_name = "prelu (Output: "; auto output_name = op_desc.Output("Out")[0]; diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index e83961f3d7bda03a7659f175c59105dcb60708e9..2571abbf69892dae626c7178609c2825775fdf2e 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -19,7 +19,9 @@ limitations under the License. */ #pragma once +#include #include +#include #include #include "paddle/fluid/framework/lod_tensor.h" @@ -79,7 +81,8 @@ class TRTConvertValidation { if_add_batch_(if_add_batch), max_batch_size_(max_batch_size) { PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); - engine_.reset(new TensorRTEngine(max_batch_size, workspace_size, stream_)); + engine_.reset( + new TensorRTEngine(max_batch_size, workspace_size, false, nullptr, 0)); engine_->InitNetwork(); } @@ -114,13 +117,12 @@ class TRTConvertValidation { } void DeclVar(const std::string& name, const std::vector dim_vec) { - platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); + platform::CUDADeviceContext ctx(place_); auto* x = scope_.Var(name); auto* x_tensor = x->GetMutable(); x_tensor->Resize(framework::make_ddim(dim_vec)); - RandomizeTensor(x_tensor, place, ctx); + RandomizeTensor(x_tensor, place_, ctx); } // Declare a variable in a fluid Scope. void DeclVar(const std::string& name, const nvinfer1::Dims& dims, @@ -146,19 +148,6 @@ class TRTConvertValidation { // Declare outputs. op_desc_.reset(new framework::OpDesc(desc, nullptr)); - - // Set Inputs. - for (const auto& input : op_desc_->InputArgumentNames()) { - if (parameters_.count(input)) continue; - auto* var = scope_.FindVar(input); - PADDLE_ENFORCE(var); - auto tensor = var->GetMutable(); - - engine_->SetInputFromGPU( - input, static_cast(tensor->data()), - sizeof(float) * - analysis::AccuDims(tensor->dims(), tensor->dims().size())); - } } // We use the set 'neglected_output' here, because some Ops like batch norm, @@ -168,43 +157,71 @@ class TRTConvertValidation { std::unordered_set neglected_output = {}) { // Execute Fluid Op PADDLE_ENFORCE_LE(batch_size, max_batch_size_); - platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); - op_->Run(scope_, place); - // Execute TRT. - engine_->Execute(batch_size); - cudaStreamSynchronize(engine_->stream()); + platform::CUDADeviceContext ctx(place_); + op_->Run(scope_, place_); - ASSERT_FALSE(op_desc_->OutputArgumentNames().empty()); - const size_t output_space_size = 3000; + std::vector input_output_names; + + // Note: we need filter the parameter + for (const auto& input : op_desc_->InputArgumentNames()) { + if (parameters_.count(input)) continue; + input_output_names.push_back(input); + } + + // Collect the fluid outputs. + std::vector> fluid_outs; for (const auto& output : op_desc_->OutputArgumentNames()) { if (neglected_output.count(output)) continue; + input_output_names.push_back(output); std::vector fluid_out; - std::vector trt_out(output_space_size); - engine_->GetOutputInCPU(output, &trt_out[0], output_space_size); - cudaStreamSynchronize(engine_->stream()); - auto* var = scope_.FindVar(output); - auto tensor = var->GetMutable(); + auto* tensor = var->GetMutable(); framework::TensorToVector(*tensor, ctx, &fluid_out); + fluid_outs.push_back(fluid_out); + } + + // Bind input and output for TRT. + const int num_bindings = input_output_names.size(); + std::vector buffers(num_bindings); + + for (const std::string& name : input_output_names) { + auto* var = scope_.FindVar(name); + auto* tensor = var->GetMutable(); + const int bind_index = engine_->engine()->getBindingIndex(name.c_str()); + buffers[bind_index] = + static_cast(tensor->mutable_data(place_)); + } + + // Execute TRT. + engine_->Execute(batch_size, &buffers, stream_); - size_t fluid_out_size = fluid_out.size(); + ASSERT_FALSE(op_desc_->OutputArgumentNames().empty()); + int index = 0; + for (const auto& output : op_desc_->OutputArgumentNames()) { + if (neglected_output.count(output)) continue; + std::vector trt_out; + auto* var = scope_.FindVar(output); + auto* tensor = var->GetMutable(); + framework::TensorToVector(*tensor, ctx, &trt_out); + + size_t fluid_out_size = fluid_outs[index].size(); if (if_add_batch_ == true) { fluid_out_size = batch_size * (framework::product(tensor->dims()) / max_batch_size_); } - // Compare two output - ASSERT_FALSE(fluid_out.empty()); + for (size_t i = 0; i < fluid_out_size; i++) { // Loose the threshold for CI in different machine model. - EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 2e-5); + EXPECT_LT(std::abs(fluid_outs[index][i] - trt_out[i]), 2e-5); } + index += 1; } } framework::Scope& scope() { return scope_; } private: + platform::CUDAPlace place_; std::unique_ptr engine_; cudaStream_t stream_; std::unique_ptr op_; diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 10f48462cfaf8073a4f5537d654d614d36b74db4..fddf5f11c285da4687b08d1962b6f1f51390e03e 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -32,36 +32,18 @@ void TensorRTEngine::Build(const DescType &paddle_model) { PADDLE_ENFORCE(false, "not implemented"); } -void TensorRTEngine::Execute(int batch_size) { +void TensorRTEngine::Execute(int batch_size, std::vector *buffers, + cudaStream_t stream) { freshDeviceId(); batch_size_ = batch_size; - std::vector buffers; - for (auto &buf : buffers_) { - PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated"); - PADDLE_ENFORCE_GT(buf.max_size, 0); - PADDLE_ENFORCE(buf.device == DeviceType::GPU); - buffers.push_back(buf.buffer); - } - infer_context_->enqueue(batch_size, buffers.data(), stream_, nullptr); - cudaStreamSynchronize(stream_); + infer_context_->enqueue(batch_size, buffers->data(), stream, nullptr); + cudaStreamSynchronize(stream); SetRuntimeBatch(batch_size); } -TensorRTEngine::~TensorRTEngine() { - cudaStreamSynchronize(stream_); - // clean buffer - for (auto &buf : buffers_) { - if (buf.device == DeviceType::GPU && buf.buffer != nullptr) { - PADDLE_ENFORCE_EQ(0, cudaFree(buf.buffer)); - buf.buffer = nullptr; - buf.max_size = 0; - } - } -} - void TensorRTEngine::FreezeNetwork() { - VLOG(3) << "TRT to freeze network"; freshDeviceId(); + VLOG(3) << "TRT to freeze network"; PADDLE_ENFORCE(infer_builder_ != nullptr, "Call InitNetwork first to initialize network."); PADDLE_ENFORCE(infer_network_ != nullptr, @@ -81,30 +63,6 @@ void TensorRTEngine::FreezeNetwork() { PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!"); infer_context_.reset(infer_engine_->createExecutionContext()); - - // allocate GPU buffers. - buffers_.resize(buffer_sizes_.size()); - for (auto &item : buffer_sizes_) { - // The output buffers are not set in the network building phrase, need to - // infer from the TesorRT network. - if (item.second == 0) { - auto slot_offset = infer_engine_->getBindingIndex(item.first.c_str()); - auto dims = infer_engine_->getBindingDimensions(slot_offset); - item.second = kDataTypeSize[static_cast( - infer_engine_->getBindingDataType(slot_offset))] * - analysis::AccuDims(dims.d, dims.nbDims) * max_batch_; - PADDLE_ENFORCE_GT(item.second, 0); - } - - auto &buf = buffer(item.first); - buf.max_size = item.second * max_batch_; - CHECK(buf.buffer == nullptr); // buffer should be allocated only once. - - PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second * max_batch_)); - buf.size = 0; - PADDLE_ENFORCE_LE(buf.max_size, 1 << 30); // 10G - buf.device = DeviceType::GPU; - } } nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name, @@ -158,83 +116,6 @@ void TensorRTEngine::DeclareOutput(const std::string &name) { buffer_sizes_[name] = 0; } -void *TensorRTEngine::GetOutputInGPU(const std::string &name) { - return buffer(name).buffer; -} - -void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst, - size_t max_size) { - // determine data size - auto *output = TensorRTEngine::GetITensor(name); - nvinfer1::Dims dims = output->getDimensions(); - auto dim_size = analysis::AccuDims(dims.d, dims.nbDims); - size_t dst_size = dim_size * runtime_batch_ * - kDataTypeSize[static_cast(output->getType())]; - - auto it = buffer_sizes_.find(name); - PADDLE_ENFORCE(it != buffer_sizes_.end()); - PADDLE_ENFORCE_GT(it->second, 0); - PADDLE_ENFORCE_LE(dst_size, it->second); - PADDLE_ENFORCE_GE(max_size, dst_size); - auto &buf = buffer(name); - PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before"); - PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, dst_size, - cudaMemcpyDeviceToDevice, stream_), - 0); -} - -void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst, - size_t max_size) { - // determine data size - - auto *output = TensorRTEngine::GetITensor(name); - nvinfer1::Dims dims = output->getDimensions(); - auto dim_size = analysis::AccuDims(dims.d, dims.nbDims); - size_t dst_size = dim_size * runtime_batch_ * - kDataTypeSize[static_cast(output->getType())]; - auto it = buffer_sizes_.find(name); - PADDLE_ENFORCE(it != buffer_sizes_.end()); - PADDLE_ENFORCE_GT(it->second, 0); - PADDLE_ENFORCE_LE(dst_size, it->second); - PADDLE_ENFORCE_GE(max_size, dst_size); - auto &buf = buffer(name); - PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before"); - PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, dst_size, - cudaMemcpyDeviceToHost, stream_)); -} - -Buffer &TensorRTEngine::buffer(const std::string &name) { - PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first."); - auto it = buffer_sizes_.find(name); - PADDLE_ENFORCE(it != buffer_sizes_.end(), "tried to access buffer named %s", - name); - auto slot_offset = infer_engine_->getBindingIndex(name.c_str()); - return buffers_[slot_offset]; -} - -void TensorRTEngine::SetInputFromCPU(const std::string &name, const void *data, - size_t size) { - auto &buf = buffer(name); - PADDLE_ENFORCE_NOT_NULL(buf.buffer); - PADDLE_ENFORCE_NOT_NULL(data); - PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small"); - PADDLE_ENFORCE(buf.device == DeviceType::GPU); - buf.size = size; - PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size, - cudaMemcpyHostToDevice, stream_)); -} - -void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data, - size_t size) { - auto &buf = buffer(name); - buf.size = size; - PADDLE_ENFORCE_NOT_NULL(buf.buffer); - PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small"); - PADDLE_ENFORCE(buf.device == DeviceType::GPU); - PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size, - cudaMemcpyDeviceToDevice, stream_)); -} - void TensorRTEngine::SetITensor(const std::string &name, nvinfer1::ITensor *tensor) { PADDLE_ENFORCE(tensor != nullptr); @@ -254,13 +135,6 @@ void TensorRTEngine::SetRuntimeBatch(size_t batch_size) { int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; } -void TensorRTEngine::freshDeviceId() { - int count; - cudaGetDeviceCount(&count); - PADDLE_ENFORCE_LT(device_, count); - cudaSetDevice(device_); -} - nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin( nvinfer1::ITensor *const *inputs, int num_inputs, plugin::PluginTensorRT *plugin) { @@ -268,6 +142,13 @@ nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin( return infer_network_.get()->addPluginExt(inputs, num_inputs, *plugin); } +void TensorRTEngine::freshDeviceId() { + int count; + cudaGetDeviceCount(&count); + PADDLE_ENFORCE_LT(device_id_, count); + cudaSetDevice(device_id_); +} + } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index cdfe09b5a7fd2d1f8548dab9421f671f5a345153..657dfd9355f9e3167a123b1f71655869d030a3df 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -23,6 +23,7 @@ limitations under the License. */ #include "paddle/fluid/inference/engine.h" #include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" #include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" #include "paddle/fluid/inference/utils/singleton.h" @@ -37,7 +38,9 @@ class TRTInt8Calibrator; * There are two alternative ways to use it, one is to build from a paddle * protobuf model, another way is to manully construct the network. */ -class TensorRTEngine : public EngineBase { +class TensorRTEngine { + using DescType = ::paddle::framework::proto::BlockDesc; + public: // Weight is model parameter. class Weight { @@ -56,28 +59,28 @@ class TensorRTEngine : public EngineBase { nvinfer1::Weights w_; }; - TensorRTEngine(int max_batch, int max_workspace, cudaStream_t stream, - int device = 0, bool enable_int8 = false, - TRTInt8Calibrator* calibrator = nullptr, + TensorRTEngine(int max_batch, int max_workspace, bool enable_int8 = false, + TRTInt8Calibrator* calibrator = nullptr, int device_id = 0, nvinfer1::ILogger& logger = NaiveLogger::Global()) : max_batch_(max_batch), max_workspace_(max_workspace), - stream_(stream), - device_(device), enable_int8_(enable_int8), calibrator_(calibrator), + device_id_(device_id), logger_(logger) {} - virtual ~TensorRTEngine(); + ~TensorRTEngine() {} // TODO(Superjomn) implement it later when graph segmentation is supported. - void Build(const DescType& paddle_model) override; + void Build(const DescType& paddle_model); - void Execute(int batch_size) override; + void Execute(int batch_size, std::vector* buffers, + cudaStream_t stream); // Initialize the inference network, so that TensorRT layers can add to this // network. void InitNetwork() { + freshDeviceId(); infer_builder_.reset(createInferBuilder(&logger_)); infer_network_.reset(infer_builder_->createNetwork()); } @@ -98,37 +101,34 @@ class TensorRTEngine : public EngineBase { // Check if the ITensor has been declared bool HasDeclared(const std::string& name); - // GPU memory address for an ITensor with specific name. One can operate on - // these memory directly for acceleration, for example, output the converted - // data directly to the buffer to save data copy overhead. - // NOTE this should be used after calling `FreezeNetwork`. - Buffer& buffer(const std::string& name) override; - - cudaStream_t stream() { return stream_; } - - // Fill an input from CPU memory with name and size. - void SetInputFromCPU(const std::string& name, const void* data, size_t size); - // TODO(Superjomn) is this method necessary given that buffer(xxx) can be - // accessed directly. Fill an input from GPU memory with name and size. - void SetInputFromGPU(const std::string& name, const void* data, size_t size); - // Get an output called name, the output of tensorrt is in GPU, so this method - // Return the output's GPU memory address without copy. - void* GetOutputInGPU(const std::string& name); - // Copy data into dst inside the GPU device. - void GetOutputInGPU(const std::string& name, void* dst, size_t max_size); - // LOW EFFICENCY! Get output to CPU, this will trigger a memory copy from GPU - // to CPU. - void GetOutputInCPU(const std::string& name, void* dst, size_t max_size); - // Fill an ITensor into map itensor_map_. void SetITensor(const std::string& name, nvinfer1::ITensor* tensor); // Get an ITensor called name. nvinfer1::ITensor* GetITensor(const std::string& name); nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); } nvinfer1::INetworkDefinition* network() { return infer_network_.get(); } + + nvinfer1::IHostMemory* Serialize() { + PADDLE_ENFORCE(infer_engine_ != nullptr, + "You should build engine first and then serialize"); + ihost_memory_.reset(infer_engine_->serialize()); + return ihost_memory_.get(); + } + + void Deserialize(const std::string& engine_serialized_data) { + freshDeviceId(); + infer_ptr runtime(createInferRuntime(&logger_)); + infer_engine_.reset(runtime->deserializeCudaEngine( + engine_serialized_data.c_str(), engine_serialized_data.size(), + &inference::Singleton::Global())); + PADDLE_ENFORCE(infer_engine_ != nullptr, + "build cuda engine failed when deserialize engine info.!"); + infer_context_.reset(infer_engine_->createExecutionContext()); + } + void SetRuntimeBatch(size_t batch_size); int GetRuntimeBatch(); - int GetDevice() { return device_; } + int GetDeviceId() { return device_id_; } nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs, int num_inputs, plugin::PluginTensorRT*); @@ -140,17 +140,12 @@ class TensorRTEngine : public EngineBase { std::unordered_map> weight_map; - // TODO(NHZLX) - // In the normal case, the paddle-trt exists bug when runing the googlenet. - // When there are more than two convolutions of 1 * 1 with the same input, the - // paddle-tensorrt will do the merging optimization, which fuse those conv - // into one conv, and then trigger bug. So, We should use strategy to avoid - // this - // optimization for the time being. This bug will be fixed in the future. - std::unordered_map - itensor_quote_num; - private: + // Each ICudaEngine object is bound to a specific GPU when it is instantiated, + // ensure that the thread is associated with the correct device by calling + // freshDeviceId(). + void freshDeviceId(); + // the max batch size int max_batch_; // the runtime batch size @@ -158,18 +153,14 @@ class TensorRTEngine : public EngineBase { // the max memory size the engine uses int max_workspace_; - cudaStream_t stream_; - // The specific GPU id that the TensorRTEngine bounded to. - int device_; - bool enable_int8_; TRTInt8Calibrator* calibrator_; // batch size of the current data, will be updated each Executation. int batch_size_{-1}; + int device_id_; nvinfer1::ILogger& logger_; - std::vector buffers_; // max data size for the buffers. std::unordered_map buffer_sizes_; std::unordered_map @@ -192,15 +183,11 @@ class TensorRTEngine : public EngineBase { infer_ptr infer_network_; infer_ptr infer_engine_; infer_ptr infer_context_; - // Each ICudaEngine object is bound to a specific GPU when it is instantiated, - // ensure that the thread is associated with the correct device by calling - // freshDeviceId(). - void freshDeviceId(); + infer_ptr ihost_memory_; }; // class TensorRTEngine // Add an layer__ into engine__ with args ARGS. // For example: -// TRT_ENGINE_ADD_LAYER(xxx, FullyConnected, input, dim, weights, bias) // // Reference // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#charRNN_define_network diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h index fc7ca7714e9325d2b6bce6189300aa339c81c2ba..010942a0678fe9a592d1a95ba9cdc6adc42cc2ec 100644 --- a/paddle/fluid/inference/tensorrt/helper.h +++ b/paddle/fluid/inference/tensorrt/helper.h @@ -17,6 +17,9 @@ #include #include #include +#include +#include +#include #include "paddle/fluid/platform/dynload/tensorrt.h" #include "paddle/fluid/platform/enforce.h" @@ -74,6 +77,32 @@ class NaiveLogger : public nvinfer1::ILogger { ~NaiveLogger() override {} }; +class NaiveProfiler : public nvinfer1::IProfiler { + public: + typedef std::pair Record; + std::vector mProfile; + + virtual void reportLayerTime(const char* layerName, float ms) { + auto record = + std::find_if(mProfile.begin(), mProfile.end(), + [&](const Record& r) { return r.first == layerName; }); + if (record == mProfile.end()) + mProfile.push_back(std::make_pair(layerName, ms)); + else + record->second += ms; + } + + void printLayerTimes() { + float totalTime = 0; + for (size_t i = 0; i < mProfile.size(); i++) { + printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(), + mProfile[i].second); + totalTime += mProfile[i].second; + } + printf("Time over all layers: %4.3f\n", totalTime); + } +}; + } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index 95443e813327c1247ac530c4d2e68b3607ff0e73..709aa103d1b6681221328b180d65e90f08d3368e 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -1,4 +1,5 @@ nv_library(tensorrt_plugin - SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu prelu_op_plugin.cu + SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu + prelu_op_plugin.cu trt_plugin_factory.cc avg_pool_op_plugin.cu DEPS enforce tensorrt_engine prelu) diff --git a/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu index 5d747af8c55d71fee90ee0cc06fd328e583f3700..f27a838162c89b6377a7ffd995608b3a5a49eeae 100644 --- a/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" #include "paddle/fluid/operators/math/pooling.h" namespace paddle { @@ -20,6 +21,12 @@ namespace inference { namespace tensorrt { namespace plugin { +AvgPoolPlugin* CreateAvgPoolPluginDeserialize(const void* buffer, + size_t length) { + return new AvgPoolPlugin(buffer, length); +} +REGISTER_TRT_PLUGIN("avg_pool_plugin", CreateAvgPoolPluginDeserialize); + nvinfer1::Dims AvgPoolPlugin::getOutputDimensions( int index, const nvinfer1::Dims* inputDims, int nbInputs) { assert(nbInputs == 1); diff --git a/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h index b5e4ece0fba446627d619df6fe225e8c07231487..a7c0aa5794e6bb131d012cb12d6d9fc12a73bd0d 100644 --- a/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h @@ -33,24 +33,27 @@ class AvgPoolPlugin : public PluginTensorRT { protected: size_t getSerializationSize() override { - return SerializedSize(ceil_mode_) + SerializedSize(ksize_) + - SerializedSize(strides_) + SerializedSize(paddings_) + - SerializedSize(input_shape_) + getBaseSerializationSize(); + return SerializedSize(getPluginType()) + SerializedSize(ceil_mode_) + + SerializedSize(ksize_) + SerializedSize(strides_) + + SerializedSize(paddings_) + SerializedSize(input_shape_) + + SerializedSize(output_shape_) + getBaseSerializationSize(); } // TRT will call this func when we need to serialize the configuration of // tensorrt. - // It should not be called by users. void serialize(void *buffer) override { + SerializeValue(&buffer, getPluginType()); serializeBase(buffer); SerializeValue(&buffer, ceil_mode_); SerializeValue(&buffer, ksize_); SerializeValue(&buffer, strides_); SerializeValue(&buffer, paddings_); SerializeValue(&buffer, input_shape_); + SerializeValue(&buffer, output_shape_); } public: + AvgPoolPlugin() {} AvgPoolPlugin(bool ceil_mode, std::vector ksize, std::vector strides, std::vector paddings, std::vector input_shape) @@ -89,6 +92,7 @@ class AvgPoolPlugin : public PluginTensorRT { DeserializeValue(&serialData, &serialLength, &strides_); DeserializeValue(&serialData, &serialLength, &paddings_); DeserializeValue(&serialData, &serialLength, &input_shape_); + DeserializeValue(&serialData, &serialLength, &output_shape_); } AvgPoolPlugin *clone() const override { @@ -96,7 +100,7 @@ class AvgPoolPlugin : public PluginTensorRT { input_shape_); } - const char *getPluginType() const override { return "avg_pool"; } + const char *getPluginType() const override { return "avg_pool_plugin"; } int getNbOutputs() const override { return 1; } nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs, int nbInputDims) override; diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu index 9cd9026b7328083389b5af484bbb15c07b4908b0..9aed3ddab1448fde7cb6b0e13bcf0b05e23622e9 100644 --- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu @@ -14,12 +14,19 @@ limitations under the License. */ #include #include "paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" namespace paddle { namespace inference { namespace tensorrt { namespace plugin { +ElementWisePlugin* CreateElementWisePluginDeserialize(const void* buffer, + size_t length) { + return new ElementWisePlugin(buffer, length); +} +REGISTER_TRT_PLUGIN("elementwise_plugin", CreateElementWisePluginDeserialize); + namespace details { template @@ -119,10 +126,10 @@ int ElementWisePlugin::enqueue(int batch_size, const void* const* inputs, const float* y = reinterpret_cast(inputs[1]); float* out = reinterpret_cast(outputs[0]); - if (type_ == nvinfer1::ElementWiseOperation::kSUM) { + if (type_ == "add") { details::ElementWise(details::Add(), x, y, out, batch_size, prev_size_, midd_size_, post_size_, stream); - } else if (type_ == nvinfer1::ElementWiseOperation::kPROD) { + } else if (type_ == "mul") { details::ElementWise(details::Mul(), x, y, out, batch_size, prev_size_, midd_size_, post_size_, stream); } else { diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h index 9c461f7a5c44ebb9d4a755288c69abff55e2dea8..3b040f14c531c540b8a855da85ecc3008224526c 100644 --- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" @@ -24,9 +25,8 @@ namespace plugin { class ElementWisePlugin : public PluginTensorRT { public: - ElementWisePlugin(nvinfer1::ElementWiseOperation type, - nvinfer1::Dims const &dims_x, nvinfer1::Dims const &dims_y, - int axis) + ElementWisePlugin(std::string type, nvinfer1::Dims const &dims_x, + nvinfer1::Dims const &dims_y, int axis) : type_(type), dims_x_(dims_x), dims_y_(dims_y), @@ -37,6 +37,9 @@ class ElementWisePlugin : public PluginTensorRT { ElementWisePlugin(void const *serial_data, size_t serial_length) { deserializeBase(serial_data, serial_length); + const char *elementwise_type; + DeserializeValue(&serial_data, &serial_length, &elementwise_type); + type_ = std::string(elementwise_type); DeserializeValue(&serial_data, &serial_length, &axis_); DeserializeValue(&serial_data, &serial_length, &dims_x_); DeserializeValue(&serial_data, &serial_length, &dims_y_); @@ -47,7 +50,7 @@ class ElementWisePlugin : public PluginTensorRT { return nullptr; } - const char *getPluginType() const override { return "elementwise"; } + const char *getPluginType() const override { return "elementwise_plugin"; } nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *input_dims, @@ -61,18 +64,21 @@ class ElementWisePlugin : public PluginTensorRT { protected: size_t getSerializationSize() override { - return SerializedSize(axis_) + SerializedSize(dims_x_) + - SerializedSize(dims_y_) + getBaseSerializationSize(); + return SerializedSize(getPluginType()) + SerializedSize(axis_) + + SerializedSize(dims_x_) + SerializedSize(dims_y_) + + getBaseSerializationSize(); } void serialize(void *buffer) override { + SerializeValue(&buffer, getPluginType()); serializeBase(buffer); + SerializeValue(&buffer, type_.c_str()); SerializeValue(&buffer, axis_); SerializeValue(&buffer, dims_x_); SerializeValue(&buffer, dims_y_); } - nvinfer1::ElementWiseOperation type_; + std::string type_; nvinfer1::Dims dims_x_; nvinfer1::Dims dims_y_; int axis_; diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu index 3075e87ea6d719a3f49d14c8c4b8015f7d688a50..b8a044fe99b91893c8c9ef661b4f46ebaa6db8c7 100644 --- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu @@ -17,6 +17,7 @@ #include #include "glog/logging.h" #include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" #include "paddle/fluid/operators/math/prelu.h" namespace paddle { @@ -24,6 +25,17 @@ namespace inference { namespace tensorrt { namespace plugin { +PReluPlugin *CreatePreluPluginDeserialize(const void *buffer, size_t length) { + return new PReluPlugin(buffer, length); +} +REGISTER_TRT_PLUGIN("prelu_plugin", CreatePreluPluginDeserialize); + +int PReluPlugin::initialize() { + cudaMalloc(&p_gpu_weight_, sizeof(float) * weight_.size()); + cudaMemcpy(p_gpu_weight_, weight_.data(), weight_.size() * sizeof(float), + cudaMemcpyHostToDevice); +} + nvinfer1::Dims PReluPlugin::getOutputDimensions(int index, const nvinfer1::Dims *inputDims, int nbInputs) { @@ -39,7 +51,8 @@ int PReluPlugin::enqueue(int batch_size, const void *const *inputs, // input dims is CHW. const auto &input_dims = this->getInputDims(0); const float *input = reinterpret_cast(inputs[0]); - const float *alpha = reinterpret_cast(alpha_.get().values); + // const float *alpha = reinterpret_cast(alpha_.get().values); + const float *alpha = p_gpu_weight_; float *output = reinterpret_cast(outputs)[0]; std::vector input_shape; diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h index 0db56a310b072e64425f70ac23267ec72353e54b..a96649503f1c764e07370cb2b47b10f3dae72be4 100644 --- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h @@ -14,7 +14,12 @@ #pragma once +#include #include +#include +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" + #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" @@ -24,39 +29,51 @@ namespace tensorrt { namespace plugin { class PReluPlugin : public PluginTensorRT { - TensorRTEngine::Weight alpha_; + std::vector weight_; + float *p_gpu_weight_; std::string mode_; protected: size_t getSerializationSize() override { - // return getBaseSerializationSize(alpha_) + SerializedSize(mode_); - return 0; + return getBaseSerializationSize() + SerializedSize(mode_.c_str()) + + SerializedSize(weight_) + SerializedSize(getPluginType()); } // TRT will call this func when we need to serialize the configuration of // tensorrt. // It should not be called by users. void serialize(void *buffer) override { - // serializeBase(buffer); - // SerializeValue(&buffer, alpha_); - // SerializeValue(&buffer, mode_); + SerializeValue(&buffer, getPluginType()); + serializeBase(buffer); + SerializeValue(&buffer, weight_); + SerializeValue(&buffer, mode_.c_str()); } public: - PReluPlugin(TensorRTEngine::Weight const &alpha, std::string const &mode) - : alpha_(alpha), mode_(mode) {} + PReluPlugin(const float *weight, const int weight_num, + std::string const &mode) + : mode_(mode) { + weight_.resize(weight_num); + std::copy(weight, weight + weight_num, weight_.data()); + } // It was used for tensorrt deserialization. // It should not be called by users. PReluPlugin(void const *serialData, size_t serialLength) { - // deserializeBase(serialData, serialLength); - // DeserializeValue(&serialData, &serialLength, &alpha_); - // DeserializeValue(&serialData, &serialLength, &mode_); + deserializeBase(serialData, serialLength); + DeserializeValue(&serialData, &serialLength, &weight_); + const char *prelu_mode; + DeserializeValue(&serialData, &serialLength, &prelu_mode); + mode_ = std::string(prelu_mode); } + ~PReluPlugin() { cudaFree(p_gpu_weight_); } + int initialize() override; - PReluPlugin *clone() const override { return new PReluPlugin(alpha_, mode_); } + PReluPlugin *clone() const override { + return new PReluPlugin(weight_.data(), weight_.size(), mode_); + } - const char *getPluginType() const override { return "prelu"; } + const char *getPluginType() const override { return "prelu_plugin"; } int getNbOutputs() const override { return 1; } nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs, int nbInputDims) override; diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu index de61ace59e299a1f51940e4b433a0133d4fbe7ff..b5503c3b95ee2429dd865fd6de416a04aafbccf0 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu @@ -15,12 +15,18 @@ #include #include #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" namespace paddle { namespace inference { namespace tensorrt { namespace plugin { +SplitPlugin* CreateSplitPluginDeserialize(const void* buffer, size_t length) { + return new SplitPlugin(buffer, length); +} +REGISTER_TRT_PLUGIN("split_plugin", CreateSplitPluginDeserialize); + // copied from operators::math::SplitFunctor template __global__ void SplitKernel(const T* input_data, const int in_row, diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h index 6f028d3d72ae3cc7d96c6782b734cdbf1243c06c..cbb72590567a35bee29387d4c00518b437913508 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" @@ -25,6 +26,7 @@ namespace plugin { class SplitPlugin : public PluginTensorRT { public: + SplitPlugin() {} SplitPlugin(int axis, std::vector const &output_lengths) : axis_(axis), same_shape_(true), output_length_(output_lengths) {} @@ -38,7 +40,7 @@ class SplitPlugin : public PluginTensorRT { return new SplitPlugin(axis_, output_length_); } - const char *getPluginType() const override { return "split"; } + const char *getPluginType() const override { return "split_plugin"; } int getNbOutputs() const override { return output_length_.size(); } nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *input_dims, @@ -50,11 +52,12 @@ class SplitPlugin : public PluginTensorRT { protected: size_t getSerializationSize() override { - return SerializedSize(axis_) + SerializedSize(output_length_) + - getBaseSerializationSize(); + return SerializedSize(getPluginType()) + SerializedSize(axis_) + + SerializedSize(output_length_) + getBaseSerializationSize(); } void serialize(void *buffer) override { + SerializeValue(&buffer, getPluginType()); serializeBase(buffer); SerializeValue(&buffer, axis_); SerializeValue(&buffer, output_length_); diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h index 86084829e150f8a39610319a8f2138f2b2fdec68..3b737bd726ad09637f8530a114362d98d1dac1b0 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h @@ -17,9 +17,10 @@ #include #include #include +#include #include -#include "paddle/fluid/inference/tensorrt/plugin/serialize.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler.h" @@ -30,6 +31,13 @@ namespace inference { namespace tensorrt { namespace plugin { +class PluginTensorRT; + +typedef std::function + PluginDeserializeFunc; + +typedef std::function PluginConstructFunc; + class PluginTensorRT : public nvinfer1::IPluginExt { public: PluginTensorRT() {} diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc new file mode 100644 index 0000000000000000000000000000000000000000..3c20b6d1e725273dbfdc20c01fb01deea4e8d88e --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc @@ -0,0 +1,48 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +PluginTensorRT* PluginFactoryTensorRT::createPlugin(const char* layer_name, + const void* serial_data, + size_t serial_length) { + const char* plugin_type; + DeserializeValue(&serial_data, &serial_length, &plugin_type); + + PADDLE_ENFORCE(Has(plugin_type), + "trt plugin type %s does not exists, check it.", plugin_type); + auto plugin = plugin_registry_[plugin_type](serial_data, serial_length); + owned_plugins_.emplace_back(plugin); + + return plugin; +} + +bool PluginFactoryTensorRT::RegisterPlugin( + const std::string& op_name, PluginDeserializeFunc deserialize_func) { + if (Has(op_name)) return false; + auto ret = plugin_registry_.emplace(op_name, deserialize_func); + return ret.second; +} + +void PluginFactoryTensorRT::DestroyPlugins() { owned_plugins_.clear(); } + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h new file mode 100644 index 0000000000000000000000000000000000000000..139c75595f9f44cacf7d14cda6b1c8eb4ef3c0ee --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h @@ -0,0 +1,78 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h" +#include "paddle/fluid/inference/utils/singleton.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +class PluginFactoryTensorRT : public nvinfer1::IPluginFactory, + public DeleteHelper { + public: + // Deserialization method + PluginTensorRT* createPlugin(const char* layer_name, const void* serial_data, + size_t serial_length) override; + + bool RegisterPlugin(const std::string& op_name, + PluginDeserializeFunc deserialize_func); + + bool Has(const std::string& op_name) { + return plugin_registry_.find(op_name) != plugin_registry_.end(); + } + + void DestroyPlugins(); + + protected: + std::unordered_map plugin_registry_; + + std::list> owned_plugins_; +}; + +class TrtPluginRegistrar { + public: + TrtPluginRegistrar(const std::string& name, + PluginDeserializeFunc deserialize_func) { + inference::Singleton::Global().RegisterPlugin( + name, deserialize_func); + } +}; + +#define REGISTER_TRT_PLUGIN(name, deserialize_func) \ + REGISTER_TRT_PLUGIN_UNIQ(__COUNTER__, name, deserialize_func) + +#define REGISTER_TRT_PLUGIN_UNIQ(ctr, name, deserialize_func) \ + static paddle::inference::tensorrt::plugin::TrtPluginRegistrar \ + trt_plugin_registrar##ctr __attribute__((unused)) = \ + paddle::inference::tensorrt::plugin::TrtPluginRegistrar( \ + name, deserialize_func) + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/serialize.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h similarity index 96% rename from paddle/fluid/inference/tensorrt/plugin/serialize.h rename to paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h index ce859f16fc87479adf090687121ff06951b5684c..1cae4ccae4cc593785d9b3b0e87523e740eef4ff 100644 --- a/paddle/fluid/inference/tensorrt/plugin/serialize.h +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h @@ -13,8 +13,8 @@ // limitations under the License. #pragma once - #include +#include #include #include #include "paddle/fluid/platform/enforce.h" @@ -24,6 +24,13 @@ namespace inference { namespace tensorrt { namespace plugin { +// Some trt base classes lack of the destructor. +// We use a assisted class to fix this. +struct DeleteHelper { + protected: + virtual ~DeleteHelper() {} +}; + template inline void SerializeValue(void** buffer, T const& value); diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc index 9eed0f6ee9ce4d9e35bec718dc8e8435921dbd81..a03dd45db0f80487cb4c2e6b68f94944e8558ae4 100644 --- a/paddle/fluid/inference/tensorrt/test_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_engine.cc @@ -17,6 +17,8 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/platform/enforce.h" @@ -27,19 +29,34 @@ namespace tensorrt { class TensorRTEngineTest : public ::testing::Test { protected: void SetUp() override { - ASSERT_EQ(0, cudaStreamCreate(&stream_)); - engine_ = new TensorRTEngine(10, 1 << 10, stream_); + ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0)); + + engine_ = new TensorRTEngine(10, 1 << 10); engine_->InitNetwork(); } void TearDown() override { - delete engine_; - cudaStreamDestroy(stream_); + if (engine_) { + delete engine_; + engine_ = nullptr; + } + } + + void PrepareInputOutput(const std::vector &input, + std::vector output_shape) { + TensorFromVector(input, *ctx_, &input_); + output_.Resize(framework::make_ddim(output_shape)); + } + + void GetOutput(std::vector *output) { + TensorToVector(output_, *ctx_, output); } protected: - TensorRTEngine* engine_; - cudaStream_t stream_; + framework::Tensor input_; + framework::Tensor output_; + TensorRTEngine *engine_; + platform::CUDADeviceContext *ctx_; }; TEST_F(TensorRTEngineTest, add_layer) { @@ -48,12 +65,14 @@ TEST_F(TensorRTEngineTest, add_layer) { float raw_weight[size] = {2.}; // Weight in CPU memory. float raw_bias[size] = {3.}; + std::vector buffers(2); // TRT binded inputs + LOG(INFO) << "create weights"; TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, size); TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, size); - auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, + auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, nvinfer1::DimsCHW{1, 1, 1}); - auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, size, + auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, size, weight.get(), bias.get()); PADDLE_ENFORCE(fc_layer != nullptr); @@ -63,18 +82,24 @@ TEST_F(TensorRTEngineTest, add_layer) { ASSERT_EQ(engine_->engine()->getNbBindings(), 2); // fill in real data - float x_v = 1234; - engine_->SetInputFromCPU("x", reinterpret_cast(&x_v), - 1 * sizeof(float)); + std::vector x_v = {1234}; + std::vector y_cpu; + PrepareInputOutput(x_v, {1}); + + auto *x_v_gpu_data = input_.mutable_data(ctx_->GetPlace()); + auto *y_gpu_data = output_.mutable_data(ctx_->GetPlace()); + + buffers[0] = reinterpret_cast(x_v_gpu_data); + buffers[1] = reinterpret_cast(y_gpu_data); + LOG(INFO) << "to execute"; - engine_->Execute(1); + engine_->Execute(1, &buffers, ctx_->stream()); LOG(INFO) << "to get output"; - float y_cpu; - engine_->GetOutputInCPU("y", &y_cpu, 1 * sizeof(float)); + GetOutput(&y_cpu); LOG(INFO) << "to checkout output"; - ASSERT_EQ(y_cpu, x_v * 2 + 3); + ASSERT_EQ(y_cpu[0], x_v[0] * 2 + 3); } TEST_F(TensorRTEngineTest, add_layer_multi_dim) { @@ -83,12 +108,13 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) { // instead of row-major, which is [[1.0, 1.1], [3.3, 4.4]] float raw_weight[4] = {1.0, 1.1, 3.3, 4.4}; float raw_bias[2] = {1.3, 2.4}; + std::vector buffers(2); // TRT binded inputs TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 4); TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 2); - auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, + auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, nvinfer1::DimsCHW{1, 2, 1}); - auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, 2, + auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, 2, weight.get(), bias.get()); PADDLE_ENFORCE(fc_layer != nullptr); @@ -96,19 +122,27 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) { engine_->FreezeNetwork(); ASSERT_EQ(engine_->engine()->getNbBindings(), 2); - float x_v[2] = {1.0, 2.0}; - engine_->SetInputFromCPU("x", reinterpret_cast(&x_v), - 2 * sizeof(float)); - engine_->Execute(1); + // fill in real data + std::vector x_v = {1.0, 2.0}; + std::vector y_cpu; + PrepareInputOutput(x_v, {2}); + + auto *x_v_gpu_data = input_.mutable_data(ctx_->GetPlace()); + auto *y_gpu_data = output_.mutable_data(ctx_->GetPlace()); + + buffers[0] = reinterpret_cast(x_v_gpu_data); + buffers[1] = reinterpret_cast(y_gpu_data); + + engine_->Execute(1, &buffers, ctx_->stream()); LOG(INFO) << "to get output"; - float y_cpu[2] = {-1., -1.}; + GetOutput(&y_cpu); auto dims = engine_->GetITensor("y")->getDimensions(); ASSERT_EQ(dims.nbDims, 3); ASSERT_EQ(dims.d[0], 2); ASSERT_EQ(dims.d[1], 1); - engine_->GetOutputInCPU("y", &y_cpu[0], 2 * sizeof(float)); + ASSERT_EQ(y_cpu[0], 4.5); ASSERT_EQ(y_cpu[1], 14.5); } @@ -117,12 +151,13 @@ TEST_F(TensorRTEngineTest, test_conv2d) { // Weight in CPU memory. float raw_weight[9] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; float raw_bias[1] = {0}; + std::vector buffers(2); // TRT binded inputs TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 9); TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 1); - auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, + auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, nvinfer1::Dims3{1, 3, 3}); - auto* conv_layer = + auto *conv_layer = TRT_ENGINE_ADD_LAYER(engine_, Convolution, *x, 1, nvinfer1::DimsHW{3, 3}, weight.get(), bias.get()); PADDLE_ENFORCE(conv_layer != nullptr); @@ -133,28 +168,36 @@ TEST_F(TensorRTEngineTest, test_conv2d) { engine_->FreezeNetwork(); ASSERT_EQ(engine_->engine()->getNbBindings(), 2); - float x_v[18] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; - engine_->SetInputFromCPU("x", reinterpret_cast(&x_v), - 18 * sizeof(float)); - engine_->Execute(2); + // fill in real data + std::vector x_v = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + std::vector y_cpu; + PrepareInputOutput(x_v, {18}); + + auto *x_v_gpu_data = input_.mutable_data(ctx_->GetPlace()); + auto *y_gpu_data = output_.mutable_data(ctx_->GetPlace()); + + buffers[0] = reinterpret_cast(x_v_gpu_data); + buffers[1] = reinterpret_cast(y_gpu_data); + + engine_->Execute(2, &buffers, ctx_->stream()); LOG(INFO) << "to get output"; - float* y_cpu = new float[18]; - engine_->GetOutputInCPU("y", &y_cpu[0], 18 * sizeof(float)); + GetOutput(&y_cpu); + ASSERT_EQ(y_cpu[0], 4.0); ASSERT_EQ(y_cpu[1], 6.0); } TEST_F(TensorRTEngineTest, test_pool2d) { // Weight in CPU memory. - auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, + auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, nvinfer1::Dims3{1, 2, 2}); + std::vector buffers(2); // TRT binded inputs nvinfer1::PoolingType pool_t = nvinfer1::PoolingType::kAVERAGE; - auto* pool_layer = - TRT_ENGINE_ADD_LAYER(engine_, Pooling, *const_cast(x), - pool_t, nvinfer1::DimsHW{2, 2}); + auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *x, pool_t, + nvinfer1::DimsHW{2, 2}); PADDLE_ENFORCE(pool_layer != nullptr); pool_layer->setStride(nvinfer1::DimsHW{1, 1}); @@ -164,14 +207,21 @@ TEST_F(TensorRTEngineTest, test_pool2d) { engine_->FreezeNetwork(); ASSERT_EQ(engine_->engine()->getNbBindings(), 2); - float x_v[8] = {1.0, 2.0, 5.0, 0.0, 2.0, 3.0, 5.0, 10.0}; - engine_->SetInputFromCPU("x", reinterpret_cast(&x_v), - 8 * sizeof(float)); - engine_->Execute(2); + // fill in real data + std::vector x_v = {1.0, 2.0, 5.0, 0.0, 2.0, 3.0, 5.0, 10.0}; + std::vector y_cpu; + PrepareInputOutput(x_v, {2}); + + auto *x_v_gpu_data = input_.mutable_data(ctx_->GetPlace()); + auto *y_gpu_data = output_.mutable_data(ctx_->GetPlace()); + + buffers[0] = reinterpret_cast(x_v_gpu_data); + buffers[1] = reinterpret_cast(y_gpu_data); + + engine_->Execute(2, &buffers, ctx_->stream()); LOG(INFO) << "to get output"; - float* y_cpu = new float[2]; - engine_->GetOutputInCPU("y", &y_cpu[0], 2 * sizeof(float)); + GetOutput(&y_cpu); ASSERT_EQ(y_cpu[0], 2.0); ASSERT_EQ(y_cpu[1], 5.0); diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 7ecd9e35332843e3a391cdad5ce32220d890abd1..8f7b6f31dec72a09c414654133dfe717606b0824 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -60,10 +60,13 @@ set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2") download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz") inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2_tester.cc) +# TODO(luotao, Superjom) Disable DAM test, temporarily fix +# https://github.com/PaddlePaddle/Paddle/issues/15032#issuecomment-455990914. +# After inference framework refactor, will reopen it. # normal DAM set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam") download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz") -inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc EXTRA_DEPS legacy_allocator SERIAL) +#inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc EXTRA_DEPS legacy_allocator SERIAL) # small DAM set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam") @@ -102,6 +105,13 @@ set(SEQ_CONV1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_conv1") download_model_and_data(${SEQ_CONV1_INSTALL_DIR} "seq_conv1_model.tar.gz" "seq_conv1_data.txt.tar.gz") inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} analyzer_seq_conv1_tester.cc) +# transformer, the dataset only works on batch_size=8 now +set(TRANSFORMER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/transformer") +download_model_and_data(${TRANSFORMER_INSTALL_DIR} "temp%2Ftransformer_model.tar.gz" "temp%2Ftransformer_data.txt.tar.gz") +inference_analysis_test(test_analyzer_transformer SRCS analyzer_transformer_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8) + # ocr set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr") if (NOT EXISTS ${OCR_INSTALL_DIR}) diff --git a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc index 3f6c933f2bcc6ed5410cb95a48f5ee6869280fe4..5157bd280d0f3ee327d5cee7799477b5e6fd3f71 100644 --- a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc @@ -107,6 +107,9 @@ void SetConfig(AnalysisConfig *cfg) { cfg->DisableGpu(); cfg->SwitchSpecifyInputNames(); cfg->SwitchIrOptim(); + if (FLAGS_zero_copy) { + cfg->SwitchUseFeedFetchOps(false); + } } void SetInput(std::vector> *inputs) { @@ -131,7 +134,7 @@ TEST(Analyzer_Pyramid_DNN, profile) { TestPrediction(reinterpret_cast(&cfg), input_slots_all, &outputs, FLAGS_num_threads); - if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { + if (FLAGS_num_threads == 1 && !FLAGS_test_all_data && !FLAGS_zero_copy) { PADDLE_ENFORCE_EQ(outputs.size(), 1UL); size_t size = GetSize(outputs[0]); PADDLE_ENFORCE_GT(size, 0); @@ -166,6 +169,19 @@ TEST(Analyzer_Pyramid_DNN, compare) { reinterpret_cast(&cfg), input_slots_all); } +// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy +TEST(Analyzer_Pyramid_DNN, compare_zero_copy) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + std::vector outputs_name; + outputs_name.emplace_back("cos_sim_2.tmp_0"); + CompareAnalysisAndZeroCopy(reinterpret_cast(&cfg), + input_slots_all, outputs_name); +} + // Compare Deterministic result TEST(Analyzer_Pyramid_DNN, compare_determine) { AnalysisConfig cfg; diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc index c27c39f40a2067dd2bd2150e4b1e53eab7cdf06e..dcf4b38ce8a9230148738cfd0840ca96b0c7cf8c 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc @@ -207,6 +207,9 @@ void SetConfig(AnalysisConfig *cfg) { cfg->DisableGpu(); cfg->SwitchSpecifyInputNames(); cfg->SwitchIrOptim(); + if (FLAGS_zero_copy) { + cfg->SwitchUseFeedFetchOps(false); + } } void SetInput(std::vector> *inputs) { @@ -285,131 +288,17 @@ TEST(Analyzer_rnn1, multi_thread) { input_slots_all, &outputs, 2 /* multi_thread */); } -// Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing -// on the complex RNN1 model. -TEST(Analyzer_rnn1, ZeroCopy) { - AnalysisConfig config; - SetConfig(&config); - config.SwitchUseFeedFetchOps(false); - - PaddlePlace place; - - auto predictor = CreatePaddlePredictor(config); - - config.SwitchUseFeedFetchOps(true); - auto native_predictor = - CreatePaddlePredictor(config.ToNativeConfig()); - - config.SwitchUseFeedFetchOps( - true); // the analysis predictor needs feed/fetch. - auto analysis_predictor = CreatePaddlePredictor(config); - -#define NEW_TENSOR(name__) \ - auto name__##_tensor = predictor->GetInputTensor(#name__); - NEW_TENSOR(data_lod_attention); - NEW_TENSOR(cell_init); - NEW_TENSOR(data); - NEW_TENSOR(week); - NEW_TENSOR(minute); - NEW_TENSOR(hidden_init); - - // Prepare data for AnalysisPredictor - DataRecord data(FLAGS_infer_data, FLAGS_batch_size); - PrepareZeroCopyInputs(data_lod_attention_tensor.get(), cell_init_tensor.get(), - data_tensor.get(), hidden_init_tensor.get(), - week_tensor.get(), minute_tensor.get(), &data, - FLAGS_batch_size); - - // Prepare data for NativePredictor - std::vector> native_inputs; - SetInput(&native_inputs); - std::vector native_outputs; - std::vector analysis_outputs; - - auto output_tensor = predictor->GetOutputTensor("final_output.tmp_1"); - // Run analysis predictor - - int num_ops; - auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops); - ASSERT_TRUE(fuse_statis.count("fc_fuse")); - ASSERT_EQ(fuse_statis.at("fc_fuse"), 1); - ASSERT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM - ASSERT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1); - ASSERT_EQ(num_ops, - 13); // After graph optimization, only 13 operators exists. - - Timer timer; - double total_time{0}; - for (int i = 0; i < FLAGS_repeat; i++) { - timer.tic(); - predictor->ZeroCopyRun(); - total_time += timer.toc(); - } - LOG(INFO) << "ZeroCopy output: " << DescribeZeroCopyTensor(*output_tensor); - - ASSERT_TRUE(native_predictor->Run(native_inputs.front(), &native_outputs)); - LOG(INFO) << "native output " << DescribeTensor(native_outputs.front()); - - int output_size{0}; // this is the number of elements not memory size - auto *zero_copy_data = output_tensor->data(&place, &output_size); - auto *native_data = static_cast(native_outputs.front().data.data()); - for (int i = 0; i < output_size; i++) { - EXPECT_NEAR(zero_copy_data[i], native_data[i], 1e-3); - } -} - -TEST(Analyzer_rnn1, ZeroCopyMultiThread) { - AnalysisConfig config; - SetConfig(&config); - config.SwitchUseFeedFetchOps(false); - -#define NEW_TENSOR(name__) \ - auto name__##_tensor = predictor->GetInputTensor(#name__); - - auto base_predictor = CreatePaddlePredictor(config); - double total_time_of_threads{0}; - std::vector threads; - - for (int tid = 0; tid < FLAGS_num_threads; tid++) { - threads.emplace_back([&, tid] { - // To ensure the thread binding correctly, - // please clone inside the threadpool. - auto predictor = base_predictor->Clone(); - NEW_TENSOR(data_lod_attention); - NEW_TENSOR(cell_init); - NEW_TENSOR(data); - NEW_TENSOR(week); - NEW_TENSOR(minute); - NEW_TENSOR(hidden_init); - - // Prepare data for AnalysisPredictor - DataRecord data(FLAGS_infer_data, FLAGS_batch_size); - Timer timer; - double total_time{0}; - - for (int i = 0; i < FLAGS_repeat; i++) { - PrepareZeroCopyInputs(data_lod_attention_tensor.get(), - cell_init_tensor.get(), data_tensor.get(), - hidden_init_tensor.get(), week_tensor.get(), - minute_tensor.get(), &data, FLAGS_batch_size); - - timer.tic(); - predictor->ZeroCopyRun(); - total_time += timer.toc(); - } - - total_time_of_threads += total_time; - - LOG(INFO) << "thread time: " << total_time / FLAGS_repeat; - }); - } - - for (auto &t : threads) { - t.join(); - } +// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy +TEST(Analyzer_rnn1, compare_zero_copy) { + AnalysisConfig cfg; + SetConfig(&cfg); - LOG(INFO) << "average time: " - << total_time_of_threads / FLAGS_num_threads / FLAGS_repeat; + std::vector> input_slots_all; + SetInput(&input_slots_all); + std::vector outputs_name; + outputs_name.emplace_back("final_output.tmp_1"); + CompareAnalysisAndZeroCopy(reinterpret_cast(&cfg), + input_slots_all, outputs_name); } } // namespace inference diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc index dd953e0dccbb3749bfcc87966453c6976dfefa10..19fa5528da4d11d2eb1a2f932f60a84c3f5468e7 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc @@ -56,14 +56,14 @@ struct DataRecord { std::vector slot_data; split_to_float(data[1], ' ', &slot_data); std::string name = data[0]; - PADDLE_ENFORCE_EQ(slot_data.size() % 11, 0, + PADDLE_ENFORCE_EQ(slot_data.size() % 11, 0UL, "line %d, %s should be divisible", num_lines, name); datasets[name].emplace_back(std::move(slot_data)); } num_samples = num_lines / num_slots; PADDLE_ENFORCE_EQ(num_samples * num_slots, static_cast(num_lines), "num samples should be divisible"); - PADDLE_ENFORCE_GT(num_samples, 0); + PADDLE_ENFORCE_GT(num_samples, 0UL); } void Prepare(int bs) { @@ -144,6 +144,9 @@ void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false) { cfg->SwitchSpecifyInputNames(); cfg->SwitchIrDebug(); cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); + if (FLAGS_zero_copy) { + cfg->SwitchUseFeedFetchOps(false); + } if (use_mkldnn) { cfg->EnableMKLDNN(); } @@ -184,10 +187,10 @@ TEST(Analyzer_seq_pool1, compare_determine) { input_slots_all); } -void analysis_fuse_statis(bool use_zerocopy) { +// Check the fuse status +TEST(Analyzer_seq_pool1, fuse_statis) { AnalysisConfig cfg; SetConfig(&cfg); - cfg.SwitchUseFeedFetchOps(!use_zerocopy); int num_ops; auto predictor = CreatePaddlePredictor(cfg); auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops); @@ -203,135 +206,17 @@ void analysis_fuse_statis(bool use_zerocopy) { EXPECT_EQ(num_ops, 171); } -// Check the fuse status -TEST(Analyzer_seq_pool1, fuse_statis) { analysis_fuse_statis(false); } - -void PrepareZeroCopyInputs( - const std::unique_ptr &predictor, - std::vector> *inputs) { - DataRecord data(FLAGS_infer_data, FLAGS_batch_size); - // only feed one batch - const auto &one_batch = data.NextBatch(); - inputs->clear(); - for (size_t i = 0; i < one_batch.size(); ++i) { - auto &slot = one_batch[i]; - auto tensor = predictor->GetInputTensor(slot.name + "_embed"); - tensor->Reshape(slot.shape); - tensor->SetLoD({slot.lod}); - ZeroCopyTensorAssignData(tensor.get(), slot.data); - inputs->emplace_back(std::move(tensor)); - } -} - -// return the output values -std::vector zerocopy_profile(int repeat_times) { - AnalysisConfig config; - SetConfig(&config); - config.SwitchUseFeedFetchOps(false); - auto predictor = CreatePaddlePredictor(config); - std::vector> inputs; - PrepareZeroCopyInputs(predictor, &inputs); - auto output_tensor = predictor->GetOutputTensor(out_var_name); - Timer timer; - LOG(INFO) << "Warm up run..."; - timer.tic(); - predictor->ZeroCopyRun(); - PrintTime(FLAGS_batch_size, 1, 1, 0, timer.toc(), 1); - if (FLAGS_profile) { - paddle::platform::ResetProfiler(); - } - LOG(INFO) << "Run " << repeat_times << " times..."; - timer.tic(); - for (int i = 0; i < repeat_times; i++) { - predictor->ZeroCopyRun(); - } - PrintTime(FLAGS_batch_size, repeat_times, 1, 0, timer.toc() / repeat_times, - 1); - - LOG(INFO) << "ZeroCopy output: " << DescribeZeroCopyTensor(*output_tensor); - PaddlePlace place; - int output_size{0}; - auto *pdata = output_tensor->data(&place, &output_size); - std::vector res(output_size); - for (int i = 0; i < output_size; ++i) { - res[i] = pdata[i]; - } - return res; -} - -TEST(Analyzer_seq_pool1, zerocopy_profile) { zerocopy_profile(FLAGS_repeat); } - -TEST(Analyzer_seq_pool1, zerocopy_profile_threads) { - AnalysisConfig config; - SetConfig(&config); - config.SwitchUseFeedFetchOps(false); - - auto base_predictor = CreatePaddlePredictor(config); - double total_time_of_threads{0}; - std::vector threads; - - for (int tid = 0; tid < FLAGS_num_threads; tid++) { - threads.emplace_back([&, tid] { - // To ensure the thread binding correctly, - // please clone inside the threadpool. - auto predictor = base_predictor->Clone(); - std::vector> inputs; - PrepareZeroCopyInputs(predictor, &inputs); - auto output_tensor = predictor->GetOutputTensor(out_var_name); - Timer timer; - double total_time{0}; - - LOG(INFO) << "Warm up run..."; - timer.tic(); - predictor->ZeroCopyRun(); - PrintTime(FLAGS_batch_size, 1, FLAGS_num_threads, tid, timer.toc(), 1); - if (FLAGS_profile) { - paddle::platform::ResetProfiler(); - } - int repeat_times = FLAGS_repeat; - LOG(INFO) << "Run " << repeat_times << " times..."; - timer.tic(); - - for (int i = 0; i < repeat_times; i++) { - predictor->ZeroCopyRun(); - } - total_time += timer.toc(); - total_time_of_threads += total_time; - - LOG(INFO) << "thread time: " << total_time / repeat_times; - }); - } - - for (auto &t : threads) { - t.join(); - } - - LOG(INFO) << "average time: " - << total_time_of_threads / FLAGS_num_threads / FLAGS_repeat; -} - -TEST(Analyzer_seq_pool1, zerocopy_fuse_statis) { analysis_fuse_statis(true); } +// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy +TEST(Analyzer_seq_pool1, compare_zero_copy) { + AnalysisConfig cfg; + SetConfig(&cfg); -TEST(Analyzer_seq_pool1, zerocopy_compare_native) { - AnalysisConfig config; - SetConfig(&config); - config.SwitchUseFeedFetchOps(true); - auto predictor = CreatePaddlePredictor(config.ToNativeConfig()); - std::vector native_outputs; std::vector> input_slots_all; SetInput(&input_slots_all); - ASSERT_TRUE(predictor->Run(input_slots_all[0], &native_outputs)); - EXPECT_EQ(native_outputs.size(), 1UL); - - auto zerocopy_output = zerocopy_profile(1); - EXPECT_EQ(zerocopy_output.size() * sizeof(float), - native_outputs.front().data.length()); - auto *native_data = static_cast(native_outputs.front().data.data()); - for (size_t i = 0; i < zerocopy_output.size(); ++i) { - EXPECT_LT( - std::fabs((zerocopy_output[i] - native_data[i]) / zerocopy_output[i]), - 1e-3); - } + std::vector outputs_name; + outputs_name.emplace_back(out_var_name); + CompareAnalysisAndZeroCopy(reinterpret_cast(&cfg), + input_slots_all, outputs_name); } } // namespace analysis diff --git a/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc b/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..9d17f38ab764148d4e1a63124289425c7e7aa983 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc @@ -0,0 +1,220 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { + +struct DataRecord { + std::vector> src_word, src_pos, trg_word, init_idx; + std::vector> src_slf_attn_bias, init_score, + trg_src_attn_bias; + std::vector> batch_data_shape; + std::vector> lod; + size_t batch_iter{0}, batch_size{1}, num_samples; // total number of samples + DataRecord() = default; + explicit DataRecord(const std::string &path, int batch_size = 1) + : batch_size(batch_size) { + Load(path); + } + DataRecord NextBatch() { + DataRecord data; + size_t batch_end = batch_iter + batch_size; + // NOTE skip the final batch, if no enough data is provided. + if (batch_end <= src_word.size()) { + data.src_word.assign(src_word.begin() + batch_iter, + src_word.begin() + batch_end); + data.src_pos.assign(src_pos.begin() + batch_iter, + src_pos.begin() + batch_end); + data.src_slf_attn_bias.assign(src_slf_attn_bias.begin() + batch_iter, + src_slf_attn_bias.begin() + batch_end); + data.trg_word.assign(trg_word.begin() + batch_iter, + trg_word.begin() + batch_end); + data.init_score.assign(init_score.begin() + batch_iter, + init_score.begin() + batch_end); + data.init_idx.assign(init_idx.begin() + batch_iter, + init_idx.begin() + batch_end); + data.trg_src_attn_bias.assign(trg_src_attn_bias.begin() + batch_iter, + trg_src_attn_bias.begin() + batch_end); + std::vector batch_shape = + *(batch_data_shape.begin() + batch_iter); + data.batch_data_shape.push_back(batch_shape); + data.lod.resize(2); + for (int i = 0; i < batch_shape[0] + 1; i++) { + data.lod[0].push_back(i); + data.lod[1].push_back(i); + } + } + batch_iter += batch_size; + return data; + } + void Load(const std::string &path) { + std::ifstream file(path); + std::string line; + size_t num_lines = 0; + while (std::getline(file, line)) { + num_lines++; + std::vector data; + split(line, ',', &data); + CHECK_EQ(data.size(), static_cast(8)); + // load src_word + std::vector src_word_data; + split_to_int64(data[0], ' ', &src_word_data); + src_word.push_back(std::move(src_word_data)); + // load src_pos + std::vector src_pos_data; + split_to_int64(data[1], ' ', &src_pos_data); + src_pos.push_back(std::move(src_pos_data)); + // load src_slf_attn_bias + std::vector src_slf_attn_bias_data; + split_to_float(data[2], ' ', &src_slf_attn_bias_data); + src_slf_attn_bias.push_back(std::move(src_slf_attn_bias_data)); + // load trg_word + std::vector trg_word_data; + split_to_int64(data[3], ' ', &trg_word_data); + trg_word.push_back(std::move(trg_word_data)); + // load init_score + std::vector init_score_data; + split_to_float(data[4], ' ', &init_score_data); + init_score.push_back(std::move(init_score_data)); + // load init_idx + std::vector init_idx_data; + split_to_int64(data[5], ' ', &init_idx_data); + init_idx.push_back(std::move(init_idx_data)); + // load trg_src_attn_bias + std::vector trg_src_attn_bias_data; + split_to_float(data[6], ' ', &trg_src_attn_bias_data); + trg_src_attn_bias.push_back(std::move(trg_src_attn_bias_data)); + // load shape for variant data shape + std::vector batch_data_shape_data; + split_to_int(data[7], ' ', &batch_data_shape_data); + batch_data_shape.push_back(std::move(batch_data_shape_data)); + } + num_samples = num_lines; + } +}; + +void PrepareInputs(std::vector *input_slots, DataRecord *data, + int batch_size) { + auto one_batch = data->NextBatch(); + batch_size = one_batch.batch_data_shape[0][0]; + auto n_head = one_batch.batch_data_shape[0][1]; + auto trg_seq_len = one_batch.batch_data_shape[0][2]; // 1 for inference + auto src_seq_len = one_batch.batch_data_shape[0][3]; + + PaddleTensor src_word, src_pos, src_slf_attn_bias, trg_word, init_score, + init_idx, trg_src_attn_bias; + + src_word.name = "src_word"; + src_word.shape.assign({batch_size, src_seq_len, 1}); + src_word.dtype = PaddleDType::INT64; + TensorAssignData(&src_word, one_batch.src_word); + + src_pos.name = "src_pos"; + src_pos.shape.assign({batch_size, src_seq_len, 1}); + src_pos.dtype = PaddleDType::INT64; + TensorAssignData(&src_pos, one_batch.src_pos); + + src_slf_attn_bias.name = "src_slf_attn_bias"; + src_slf_attn_bias.shape.assign( + {batch_size, n_head, src_seq_len, src_seq_len}); + src_slf_attn_bias.dtype = PaddleDType::FLOAT32; + TensorAssignData(&src_slf_attn_bias, one_batch.src_slf_attn_bias); + + trg_word.name = "trg_word"; + trg_word.shape.assign({batch_size, 1}); + trg_word.dtype = PaddleDType::INT64; + trg_word.lod.assign(one_batch.lod.begin(), one_batch.lod.end()); + TensorAssignData(&trg_word, one_batch.trg_word); + + init_score.name = "init_score"; + init_score.shape.assign({batch_size, 1}); + init_score.dtype = PaddleDType::FLOAT32; + init_score.lod.assign(one_batch.lod.begin(), one_batch.lod.end()); + TensorAssignData(&init_score, one_batch.init_score); + + init_idx.name = "init_idx"; + init_idx.shape.assign({batch_size}); + init_idx.dtype = PaddleDType::INT32; + TensorAssignData(&init_idx, one_batch.init_idx); + + trg_src_attn_bias.name = "trg_src_attn_bias"; + trg_src_attn_bias.shape.assign( + {batch_size, n_head, trg_seq_len, src_seq_len}); + trg_src_attn_bias.dtype = PaddleDType::FLOAT32; + TensorAssignData(&trg_src_attn_bias, one_batch.trg_src_attn_bias); + + input_slots->assign({src_word, src_pos, src_slf_attn_bias, trg_word, + init_score, init_idx, trg_src_attn_bias}); +} + +void SetConfig(AnalysisConfig *cfg) { + cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params"); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); + cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); +} + +void SetInput(std::vector> *inputs) { + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + std::vector input_slots; + int test_batch_num = + FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1; + LOG(INFO) << "The number of samples to be test: " + << test_batch_num * FLAGS_batch_size; + for (int bid = 0; bid < test_batch_num; ++bid) { + input_slots.clear(); + PrepareInputs(&input_slots, &data, FLAGS_batch_size); + (*inputs).emplace_back(input_slots); + } +} + +// Easy for profiling independently. +TEST(Analyzer_Transformer, profile) { + AnalysisConfig cfg; + SetConfig(&cfg); + std::vector outputs; + + std::vector> input_slots_all; + SetInput(&input_slots_all); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); +} + +// Check the fuse status +TEST(Analyzer_Transformer, fuse_statis) { + AnalysisConfig cfg; + SetConfig(&cfg); + + int num_ops; + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); +} + +// Compare result of NativeConfig and AnalysisConfig +TEST(Analyzer_Transformer, compare) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 2811eb4946ea025cf6c7ab197c4e603df86f6f2d..a4881afe58a03902556ddb8a057c5f0579e4d1d2 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -17,13 +17,14 @@ #include #include +#include #include #include // NOLINT +#include #include #ifdef WITH_GPERFTOOLS #include #endif - #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/analysis/analyzer.h" @@ -49,6 +50,7 @@ DEFINE_bool(use_analysis, true, DEFINE_bool(record_benchmark, false, "Record benchmark after profiling the model"); DEFINE_double(accuracy, 1e-3, "Result Accuracy."); +DEFINE_bool(zero_copy, false, "Use ZeroCopy to speedup Feed/Fetch."); DECLARE_bool(profile); DECLARE_int32(paddle_num_threads); @@ -66,6 +68,7 @@ void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) { LOG(INFO) << analysis_config->ToNativeConfig(); } +// Compare result between two PaddleTensor void CompareResult(const std::vector &outputs, const std::vector &ref_outputs) { EXPECT_GT(outputs.size(), 0UL); @@ -95,6 +98,58 @@ void CompareResult(const std::vector &outputs, } break; } + case PaddleDType::INT32: { + int32_t *pdata = static_cast(out.data.data()); + int32_t *pdata_ref = static_cast(ref_out.data.data()); + for (size_t j = 0; j < size; ++j) { + EXPECT_EQ(pdata_ref[j], pdata[j]); + } + break; + } + } + } +} + +// Compare result between a PaddleTensor and a ZeroCopyTensor +void CompareResult(const std::vector &outputs, + const std::vector &ref_outputs) { + EXPECT_GT(outputs.size(), 0UL); + EXPECT_EQ(outputs.size(), ref_outputs.size()); + for (size_t i = 0; i < outputs.size(); i++) { + auto &out = outputs[i]; + auto &ref_out = ref_outputs[i]; + size_t size = VecReduceToInt(out.shape); + EXPECT_GT(size, 0UL); + int ref_size = 0; // this is the number of elements not memory size + PaddlePlace place; + switch (out.dtype) { + case PaddleDType::INT64: { + int64_t *pdata = static_cast(out.data.data()); + int64_t *pdata_ref = ref_out.data(&place, &ref_size); + EXPECT_EQ(size, ref_size); + for (size_t j = 0; j < size; ++j) { + EXPECT_EQ(pdata_ref[j], pdata[j]); + } + break; + } + case PaddleDType::FLOAT32: { + float *pdata = static_cast(out.data.data()); + float *pdata_ref = ref_out.data(&place, &ref_size); + EXPECT_EQ(size, ref_size); + for (size_t j = 0; j < size; ++j) { + CHECK_LE(std::abs(pdata_ref[j] - pdata[j]), FLAGS_accuracy); + } + break; + } + case PaddleDType::INT32: { + int32_t *pdata = static_cast(out.data.data()); + int32_t *pdata_ref = ref_out.data(&place, &ref_size); + EXPECT_EQ(size, ref_size); + for (size_t j = 0; j < size; ++j) { + EXPECT_EQ(pdata_ref[j], pdata[j]); + } + break; + } } } } @@ -196,107 +251,127 @@ void GetInputPerBatch(const std::vector> &in, } } -void TestOneThreadPrediction( - const PaddlePredictor::Config *config, - const std::vector> &inputs, - std::vector *outputs, bool use_analysis = true) { - int batch_size = FLAGS_batch_size; - int num_times = FLAGS_repeat; - auto predictor = CreateTestPredictor(config, use_analysis); +void ConvertPaddleTensorToZeroCopyTensor( + PaddlePredictor *predictor, const std::vector &inputs) { + for (size_t i = 0; i < inputs.size(); i++) { + auto input = inputs[i]; + auto tensor = predictor->GetInputTensor(input.name); + tensor->Reshape(input.shape); + tensor->SetLoD({input.lod}); + if (input.dtype == PaddleDType::INT64) { + ZeroCopyTensorAssignData(tensor.get(), input.data); + } else if (input.dtype == PaddleDType::FLOAT32) { + ZeroCopyTensorAssignData(tensor.get(), input.data); + } else if (input.dtype == PaddleDType::INT32) { + ZeroCopyTensorAssignData(tensor.get(), input.data); + } else { + LOG(ERROR) << "unsupported feed type " << input.dtype; + } + } +} - // warmup run - LOG(INFO) << "Warm up run..."; - { - Timer warmup_timer; - warmup_timer.tic(); +void PredictionWarmUp(PaddlePredictor *predictor, + const std::vector> &inputs, + std::vector *outputs, int num_threads, + int tid) { + int batch_size = FLAGS_batch_size; + LOG(INFO) << "Running thread " << tid << ", warm up run..."; + if (FLAGS_zero_copy) { + ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[0]); + } + Timer warmup_timer; + warmup_timer.tic(); + if (!FLAGS_zero_copy) { predictor->Run(inputs[0], outputs, batch_size); - PrintTime(batch_size, 1, 1, 0, warmup_timer.toc(), 1); - if (FLAGS_profile) { - paddle::platform::ResetProfiler(); - } + } else { + predictor->ZeroCopyRun(); } + PrintTime(batch_size, 1, num_threads, tid, warmup_timer.toc(), 1); + if (FLAGS_profile) { + paddle::platform::ResetProfiler(); + } +} - LOG(INFO) << "Run " << num_times << " times..."; - { - Timer run_timer; - run_timer.tic(); +void PredictionRun(PaddlePredictor *predictor, + const std::vector> &inputs, + std::vector *outputs, int num_threads, + int tid) { + int batch_size = FLAGS_batch_size; + int num_times = FLAGS_repeat; + LOG(INFO) << "Thread " << tid << " run " << num_times << " times..."; + Timer run_timer; + double elapsed_time = 0; #ifdef WITH_GPERFTOOLS - ProfilerStart("paddle_inference.prof"); + ProfilerStart("paddle_inference.prof"); #endif - for (int i = 0; i < num_times; i++) { - for (size_t j = 0; j < inputs.size(); j++) { - predictor->Run(inputs[j], outputs, batch_size); + if (!FLAGS_zero_copy) { + run_timer.tic(); + for (size_t i = 0; i < inputs.size(); i++) { + for (int j = 0; j < num_times; j++) { + predictor->Run(inputs[i], outputs, batch_size); } } + elapsed_time = run_timer.toc(); + } else { + for (size_t i = 0; i < inputs.size(); i++) { + ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[i]); + run_timer.tic(); + for (int j = 0; j < num_times; j++) { + predictor->ZeroCopyRun(); + } + elapsed_time += run_timer.toc(); + } + } #ifdef WITH_GPERFTOOLS - ProfilerStop(); + ProfilerStop(); #endif - double latency = run_timer.toc() / (num_times > 1 ? num_times : 1); - PrintTime(batch_size, num_times, 1, 0, latency, inputs.size()); - if (FLAGS_record_benchmark) { - Benchmark benchmark; - benchmark.SetName(FLAGS_model_name); - benchmark.SetBatchSize(batch_size); - benchmark.SetLatency(latency); - benchmark.PersistToFile("benchmark_record.txt"); - } + PrintTime(batch_size, num_times, num_threads, tid, elapsed_time / num_times, + inputs.size()); + if (FLAGS_record_benchmark) { + Benchmark benchmark; + benchmark.SetName(FLAGS_model_name); + benchmark.SetBatchSize(batch_size); + benchmark.SetLatency(elapsed_time / num_times); + benchmark.PersistToFile("benchmark_record.txt"); } } +void TestOneThreadPrediction( + const PaddlePredictor::Config *config, + const std::vector> &inputs, + std::vector *outputs, bool use_analysis = true) { + auto predictor = CreateTestPredictor(config, use_analysis); + PredictionWarmUp(predictor.get(), inputs, outputs, 1, 0); + PredictionRun(predictor.get(), inputs, outputs, 1, 0); +} + void TestMultiThreadPrediction( const PaddlePredictor::Config *config, const std::vector> &inputs, std::vector *outputs, int num_threads, bool use_analysis = true) { - int batch_size = FLAGS_batch_size; - int num_times = FLAGS_repeat; std::vector threads; - auto main_predictor = CreateTestPredictor(config, use_analysis); + std::vector> predictors; + predictors.emplace_back(CreateTestPredictor(config, use_analysis)); + for (int tid = 1; tid < num_threads; tid++) { + predictors.emplace_back(predictors.front()->Clone()); + } - size_t total_time{0}; for (int tid = 0; tid < num_threads; ++tid) { threads.emplace_back([&, tid]() { // Each thread should have local inputs and outputs. // The inputs of each thread are all the same. std::vector outputs_tid; - // To ensure the thread binding correctly, - // please clone inside the threadpool. - auto predictor = main_predictor->Clone(); + auto &predictor = predictors[tid]; #ifdef PADDLE_WITH_MKLDNN if (use_analysis) { static_cast(predictor.get()) ->SetMkldnnThreadID(static_cast(tid) + 1); } #endif - - // warmup run - LOG(INFO) << "Running thread " << tid << ", warm up run..."; - { - Timer warmup_timer; - warmup_timer.tic(); - predictor->Run(inputs[0], outputs, batch_size); - PrintTime(batch_size, 1, num_threads, tid, warmup_timer.toc(), 1); - if (FLAGS_profile) { - paddle::platform::ResetProfiler(); - } - } - - LOG(INFO) << "Thread " << tid << " run " << num_times << " times..."; - { - Timer timer; - timer.tic(); - for (int i = 0; i < num_times; i++) { - for (const auto &input : inputs) { - ASSERT_TRUE(predictor->Run(input, &outputs_tid)); - } - } - - auto time = timer.toc(); - total_time += time; - PrintTime(batch_size, num_times, num_threads, tid, time / num_times, - inputs.size()); - } + PredictionWarmUp(predictor.get(), inputs, outputs, num_threads, tid); + PredictionRun(predictor.get(), inputs, outputs, num_threads, tid); }); } for (int i = 0; i < num_threads; ++i) { @@ -356,6 +431,31 @@ void CompareNativeAndAnalysis( CompareResult(analysis_outputs, native_outputs); } +void CompareAnalysisAndZeroCopy( + PaddlePredictor::Config *config, + const std::vector> &inputs, + const std::vector &outputs_name) { + int batch_size = FLAGS_batch_size; + // analysis + std::vector analysis_outputs; + auto predictor = CreateTestPredictor(config, true); + predictor->Run(inputs[0], &analysis_outputs, batch_size); + // analysis + zero_copy + std::vector zerocopy_outputs; + reinterpret_cast(config)->SwitchUseFeedFetchOps(false); + predictor = CreateTestPredictor(config, true); + ConvertPaddleTensorToZeroCopyTensor(predictor.get(), inputs[0]); + predictor->ZeroCopyRun(); + for (size_t i = 0; i < outputs_name.size(); i++) { + ZeroCopyTensor zerocopy_output = + *predictor->GetOutputTensor(outputs_name[i]).get(); + zerocopy_outputs.emplace_back(zerocopy_output); + LOG(INFO) << "ZeroCopy output: " << DescribeZeroCopyTensor(zerocopy_output); + } + // compare + CompareResult(analysis_outputs, zerocopy_outputs); +} + template std::string LoDTensorSummary(const framework::LoDTensor &tensor) { std::stringstream ss; diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc index 17a433c9d98768dbda4ba93bdceb6cc1717adc07..cb668a4174134ba3ce9517955ff740ada568e97b 100644 --- a/paddle/fluid/inference/tests/api/trt_models_tester.cc +++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc @@ -54,7 +54,8 @@ void SetConfig(AnalysisConfig* config, std::string model_dir, if (use_gpu) { config->EnableUseGpu(100, 0); if (use_tensorrt) { - config->EnableTensorRtEngine(1 << 10, batch_size); + config->EnableTensorRtEngine(1 << 10, batch_size, 3, + AnalysisConfig::Precision::kFloat32, false); config->pass_builder()->DeletePass("conv_bn_fuse_pass"); config->pass_builder()->DeletePass("fc_fuse_pass"); config->pass_builder()->TurnOnDebug(); diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake index 29f0f034a2aab50330d4d0127b870a5cb00d56a5..f551b322fe00892be79dd966235504bb4f54c718 100644 --- a/paddle/fluid/inference/tests/test.cmake +++ b/paddle/fluid/inference/tests/test.cmake @@ -1,23 +1,49 @@ +include(ExternalProject) set(INFERENCE_URL "http://paddle-inference-dist.cdn.bcebos.com" CACHE STRING "inference download url") set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING "A path setting inference demo download directories.") -function (inference_download install_dir url filename) - message(STATUS "Download inference test stuff from ${url}/${filename}") - file(DOWNLOAD "${url}/${filename}" "${install_dir}/${filename}") - message(STATUS "finish downloading ${filename}") + +function(inference_download INSTALL_DIR URL FILENAME) + message(STATUS "Download inference test stuff from ${URL}/${FILENAME}") + string(REGEX REPLACE "[-%.]" "_" FILENAME_EX ${FILENAME}) + ExternalProject_Add( + extern_inference_download_${FILENAME_EX} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${INSTALL_DIR} + URL ${URL}/${FILENAME} + DOWNLOAD_COMMAND wget -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} + DOWNLOAD_DIR ${INSTALL_DIR} + DOWNLOAD_NO_PROGRESS 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + UPDATE_COMMAND "" + INSTALL_COMMAND "" + ) endfunction() -function (inference_download_and_uncompress install_dir url filename) - inference_download(${install_dir} ${url} ${filename}) - execute_process( - COMMAND ${CMAKE_COMMAND} -E tar xzf ${install_dir}/${filename} - WORKING_DIRECTORY ${install_dir} - ) +function(inference_download_and_uncompress INSTALL_DIR URL FILENAME) + message(STATUS "Download inference test stuff from ${URL}/${FILENAME}") + string(REGEX REPLACE "[-%.]" "_" FILENAME_EX ${FILENAME}) + set(EXTERNAL_PROJECT_NAME "extern_inference_download_${FILENAME_EX}") + set(UNPACK_DIR "${INSTALL_DIR}/src/${EXTERNAL_PROJECT_NAME}") + ExternalProject_Add( + ${EXTERNAL_PROJECT_NAME} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${INSTALL_DIR} + DOWNLOAD_COMMAND wget -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} && + ${CMAKE_COMMAND} -E tar xzf ${INSTALL_DIR}/${FILENAME} + DOWNLOAD_DIR ${INSTALL_DIR} + DOWNLOAD_NO_PROGRESS 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + UPDATE_COMMAND "" + INSTALL_COMMAND "" + ) endfunction() set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec") -if (NOT EXISTS ${WORD2VEC_INSTALL_DIR}) - inference_download_and_uncompress(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz") +if(NOT EXISTS ${WORD2VEC_INSTALL_DIR} AND NOT WIN32) + inference_download_and_uncompress(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz") endif() set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model") diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index 75fa611c0d701dd81dfe5b33231655e0959c7dbf..861f69f4d2143b16bdec546d92ce7bd13ca53ed3 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -171,9 +171,7 @@ void TestInference(const std::string& dirname, // Enable the profiler paddle::platform::EnableProfiler(state); { - paddle::platform::RecordEvent record_event( - "init_program", - paddle::platform::DeviceContextPool::Instance().Get(place)); + paddle::platform::RecordEvent record_event("init_program"); inference_program = InitProgram(&executor, scope, dirname, is_combined); } @@ -230,9 +228,7 @@ void TestInference(const std::string& dirname, // Run repeat times to profile the performance for (int i = 0; i < repeat; ++i) { - paddle::platform::RecordEvent record_event( - "run_inference", - paddle::platform::DeviceContextPool::Instance().Get(place)); + paddle::platform::RecordEvent record_event("run_inference"); if (PrepareContext) { // Note: if you change the inference_program, you need to call diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt index e7268077643c3988c59a52bf54873f1e8db4619b..7eb663ea280e65f3c10304aa47c9970df099b901 100644 --- a/paddle/fluid/memory/CMakeLists.txt +++ b/paddle/fluid/memory/CMakeLists.txt @@ -1,6 +1,6 @@ add_subdirectory(detail) add_subdirectory(allocation) -cc_library(malloc SRCS malloc.cc DEPS place enforce allocator_facade) +cc_library(malloc SRCS malloc.cc DEPS place enforce allocator_facade profiler) cc_library(memcpy SRCS memcpy.cc DEPS place) cc_library(memory diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 4b7b9064dcde9b5209264257d51bbd976ba8eb85..7c44e18f8f39cfcdf749441ba7530e5227c44b5f 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -3,7 +3,7 @@ cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator) cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator) cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator) cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator) -cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator) +cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator profiler) cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator) if (WITH_GPU) diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc index e983ae327d69389526ae4cae226b0eb324759700..c233bf4edf5462dc48f6c3f4f22a517a03585b45 100644 --- a/paddle/fluid/memory/allocation/legacy_allocator.cc +++ b/paddle/fluid/memory/allocation/legacy_allocator.cc @@ -12,8 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/memory/allocation/legacy_allocator.h" - +#include #include #include #include @@ -23,9 +22,11 @@ #endif #include "glog/logging.h" +#include "paddle/fluid/memory/allocation/legacy_allocator.h" #include "paddle/fluid/memory/detail/buddy_allocator.h" #include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/split.h" @@ -328,18 +329,22 @@ size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const { } // namespace legacy namespace allocation { - LegacyMemMonitor GPUMemMonitor; Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { void *ptr = boost::apply_visitor(legacy::AllocVisitor(size), place_); - return new Allocation(ptr, size, place_); + auto *tmp_alloc = new Allocation(ptr, size, place_); + platform::MemEvenRecorder::Instance().PushMemRecord( + static_cast(tmp_alloc), place_, size); + return tmp_alloc; } void LegacyAllocator::Free(Allocation *allocation) { boost::apply_visitor( legacy::FreeVisitor(allocation->ptr(), allocation->size()), allocation->place()); + platform::MemEvenRecorder::Instance().PopMemRecord( + static_cast(allocation), place_); delete allocation; } @@ -356,7 +361,7 @@ void MemInfo::Minus(const size_t &size) { usage_ -= size; } -uint64_t MemInfo::GetPeakUsage() { return peak_usage_; } +uint64_t MemInfo::GetPeakUsage() const { return peak_usage_; } LegacyMemMonitor::~LegacyMemMonitor() { for (auto &item : gpu_mem_info_) delete item.second; @@ -380,10 +385,10 @@ void LegacyMemMonitor::Minus(const int &device, const size_t &size) { gpu_mem_info_[device]->Minus(size); } -uint64_t LegacyMemMonitor::GetMemUsage(const int &device) { +uint64_t LegacyMemMonitor::GetMemUsage(const int &device) const { return gpu_mem_info_.find(device) == gpu_mem_info_.end() ? 0 - : gpu_mem_info_[device]->GetPeakUsage(); + : gpu_mem_info_.at(device)->GetPeakUsage(); } void LegacyMemMonitor::PrintMemUsage() { diff --git a/paddle/fluid/memory/allocation/legacy_allocator.h b/paddle/fluid/memory/allocation/legacy_allocator.h index ccbc8c70d8e9a16e7edb1be54bf80bec3b368eca..d9bdae153da6439598f76f5cac226897e6e0c596 100644 --- a/paddle/fluid/memory/allocation/legacy_allocator.h +++ b/paddle/fluid/memory/allocation/legacy_allocator.h @@ -27,20 +27,20 @@ namespace allocation { class MemInfo { public: MemInfo() : usage_(0), peak_usage_(0) {} - MemInfo(const MemInfo &) = delete; - MemInfo &operator=(const MemInfo &) = delete; // return a flag to indicate current operation will create a peak point or not bool Add(const size_t &); void Minus(const size_t &); - uint64_t GetPeakUsage(); + uint64_t GetPeakUsage() const; private: /* current memory usage*/ uint64_t usage_; uint64_t peak_usage_; std::mutex mutex_; + + DISABLE_COPY_AND_ASSIGN(MemInfo); }; class LegacyMemMonitor { @@ -56,11 +56,11 @@ class LegacyMemMonitor { void Add(const int &, const size_t &); void Minus(const int &, const size_t &); - uint64_t GetMemUsage(const int &); + uint64_t GetMemUsage(const int &) const; void PrintMemUsage(); - protected: + private: MemUsage gpu_mem_info_; }; diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 2a6f70a01e303aa1b608248cbeb8dcfa24837a0c..1408163e4b5278ddcd65eb4f2900109d772a589a 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/memory/memcpy.h" #include // for memcpy +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace memory { @@ -29,14 +30,23 @@ void Copy(platform::CPUPlace, void* dst, #ifdef PADDLE_WITH_CUDA static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K +// NOTE(zcd): Do not use GpuMemcpySync as much as possible. +// because GpuMemcpySync issues the copying command to the default stream, +// which will make two commands from different streams cannot run concurrently. +// Reference: +// https://devblogs.nvidia.com/gpu-pro-tip-cuda-7-streams-simplify-concurrency/ + template <> void Copy( platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place, const void* src, size_t num, cudaStream_t stream) { platform::SetDeviceId(src_place.device); + if (stream) { + platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CPU"); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); } else { + platform::RecordEvent record_event("GpuMemcpySync:GPU->CPU"); platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost); // FIXME(zjl): do we really need it? if (num <= kMaxGpuAsyncCopyBytes) { @@ -51,8 +61,10 @@ void Copy( const void* src, size_t num, cudaStream_t stream) { platform::SetDeviceId(dst_place.device); if (stream) { + platform::RecordEvent record_event("GpuMemcpyAsync:CPU->GPU"); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); } else { + platform::RecordEvent record_event("GpuMemcpySync:CPU->GPU"); platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice); // FIXME(zjl): do we really need it? if (num <= kMaxGpuAsyncCopyBytes) { @@ -68,15 +80,19 @@ void Copy( if (dst_place == src_place) { platform::SetDeviceId(src_place.device); if (stream) { + platform::RecordEvent record_event("GpuMemcpyAsync(same_gpu):GPU->GPU"); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream); } else { + platform::RecordEvent record_event("GpuMemcpySync(same_gpu):GPU->GPU"); platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice); } } else { if (stream) { + platform::RecordEvent record_event("GpuMemcpyPeerAsync:GPU->GPU"); platform::GpuMemcpyPeerAsync(dst, dst_place.device, src, src_place.device, num, stream); } else { + platform::RecordEvent record_event("GpuMemcpyPeerSync:GPU->GPU"); platform::GpuMemcpyPeerSync(dst, dst_place.device, src, src_place.device, num); } @@ -111,8 +127,10 @@ void Copy( cudaStream_t stream) { platform::SetDeviceId(src_place.device); if (stream) { + platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned"); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); } else { + platform::RecordEvent record_event("GpuMemcpySync:GPU->CUDAPinned"); platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost); } } @@ -124,8 +142,10 @@ void Copy( cudaStream_t stream) { platform::SetDeviceId(dst_place.device); if (stream) { + platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU"); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); } else { + platform::RecordEvent record_event("GpuMemcpySync:CUDAPinned->GPU"); platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice); } } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index e099425b94221bf1229e936fc1781615d13dbc26..a3f2a69aef52b6f55aa09e6dee2c22c048626c0d 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -66,7 +66,7 @@ set(COMMON_OP_DEPS ${OP_HEADER_DEPS}) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler tree2col) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search) if (WITH_GPU) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu) @@ -97,3 +97,4 @@ if (WITH_PYTHON) endif() set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") +add_subdirectory(benchmark) diff --git a/paddle/fluid/operators/activation_cudnn.cu.cc b/paddle/fluid/operators/activation_cudnn.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..494c02374a9faa22486644c9b9c7d586c86d41b0 --- /dev/null +++ b/paddle/fluid/operators/activation_cudnn.cu.cc @@ -0,0 +1,40 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/platform/cudnn_desc.h" + +namespace paddle { +namespace operators { +using framework::Tensor; +using platform::ActivationDescriptor; +using platform::TensorDescriptor; + +template +class CudnnActivationKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + framework::Tensor *X, *Out; + ExtractActivationTensor(context, X, Out); + ActivationDescriptor act_desc; + TensorDescriptor x_desc, out_desc; + x_desc.set(detail::Ref(X)); + out_desc.set(detail::Ref(Out)); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/activation_cudnn_op.cu.cc b/paddle/fluid/operators/activation_cudnn_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..a382414d5c473a9c36f92a9af56837da819e96a4 --- /dev/null +++ b/paddle/fluid/operators/activation_cudnn_op.cu.cc @@ -0,0 +1,175 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/platform/cudnn_desc.h" + +namespace paddle { +namespace operators { +using framework::Tensor; +using platform::ActivationDescriptor; +using platform::TensorDescriptor; +using platform::CUDADeviceContext; + +template +struct CudnnActivationFunctor { + using ELEMENT_TYPE = T; + CudnnActivationFunctor(const CUDADeviceContext& ctx, const T& c, + const cudnnActivationMode_t& m) + : ctx_(ctx), coef_(c), mode_(m) {} + void operator()(const Tensor& x, Tensor* out) { + ActivationDescriptor act_desc; + act_desc.set(mode_, coef_); + TensorDescriptor x_desc, out_desc; + x_desc.set(x); + out_desc.set(detail::Ref(out)); + PADDLE_ENFORCE(platform::dynload::cudnnActivationForward( + ctx_.cudnn_handle(), act_desc.desc(), + platform::CudnnDataType::kOne(), x_desc.desc(), x.data(), + platform::CudnnDataType::kZero(), out_desc.desc(), + out->mutable_data(ctx_.GetPlace()))); + } + const CUDADeviceContext& ctx_; + const T coef_; + const cudnnActivationMode_t mode_; +}; + +template +struct CudnnActivationGradFunctor { + using ELEMENT_TYPE = T; + CudnnActivationGradFunctor(const CUDADeviceContext& ctx, const T& c, + const cudnnActivationMode_t& m) + : ctx_(ctx), coef_(c), mode_(m) {} + void operator()(const Tensor& x, const Tensor& out, const Tensor dout, + Tensor* dx) { + ActivationDescriptor act_desc; + act_desc.set(mode_, coef_); + TensorDescriptor x_desc, out_desc, dout_desc, dx_desc; + x_desc.set(x); + out_desc.set(out); + dout_desc.set(dout); + dx_desc.set(detail::Ref(dx)); + PADDLE_ENFORCE(platform::dynload::cudnnActivationBackward( + ctx_.cudnn_handle(), act_desc.desc(), + platform::CudnnDataType::kOne(), out_desc.desc(), out.data(), + dout_desc.desc(), dout.data(), x_desc.desc(), x.data(), + platform::CudnnDataType::kZero(), dx_desc.desc(), + dx->mutable_data(ctx_.GetPlace()))); + } + const CUDADeviceContext& ctx_; + const T coef_; + const cudnnActivationMode_t mode_; +}; + +template +struct CudnnReluFunctor : public CudnnActivationFunctor { + explicit CudnnReluFunctor(const CUDADeviceContext& ctx) + : CudnnActivationFunctor(ctx, 0.0, CUDNN_ACTIVATION_RELU) {} +}; +template +struct CudnnReluGradFunctor : public CudnnActivationGradFunctor { + explicit CudnnReluGradFunctor(const CUDADeviceContext& ctx) + : CudnnActivationGradFunctor(ctx, 0.0, CUDNN_ACTIVATION_RELU) {} +}; + +template +struct CudnnRelu6Functor : public CudnnActivationFunctor { + explicit CudnnRelu6Functor(const CUDADeviceContext& ctx) + : CudnnActivationFunctor(ctx, 6.0, CUDNN_ACTIVATION_CLIPPED_RELU) {} +}; +template +struct CudnnRelu6GradFunctor : public CudnnActivationGradFunctor { + explicit CudnnRelu6GradFunctor(const CUDADeviceContext& ctx) + : CudnnActivationGradFunctor(ctx, 6.0, CUDNN_ACTIVATION_CLIPPED_RELU) { + } +}; + +template +struct CudnnSigmoidFunctor : public CudnnActivationFunctor { + explicit CudnnSigmoidFunctor(const CUDADeviceContext& ctx) + : CudnnActivationFunctor(ctx, 0.0, CUDNN_ACTIVATION_SIGMOID) {} +}; +template +struct CudnnSigmoidGradFunctor : public CudnnActivationGradFunctor { + explicit CudnnSigmoidGradFunctor(const CUDADeviceContext& ctx) + : CudnnActivationGradFunctor(ctx, 0.0, CUDNN_ACTIVATION_SIGMOID) {} +}; + +template +struct CudnnTanhFunctor : public CudnnActivationFunctor { + explicit CudnnTanhFunctor(const CUDADeviceContext& ctx) + : CudnnActivationFunctor(ctx, 0.0, CUDNN_ACTIVATION_TANH) {} +}; +template +struct CudnnTanhGradFunctor : public CudnnActivationGradFunctor { + explicit CudnnTanhGradFunctor(const CUDADeviceContext& ctx) + : CudnnActivationGradFunctor(ctx, 0.0, CUDNN_ACTIVATION_TANH) {} +}; + +template +class CudnnActivationKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* X = nullptr; + framework::Tensor* Out = nullptr; + ExtractActivationTensor(context, &X, &Out); + Out->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); + Functor functor(dev_ctx); + functor(detail::Ref(X), Out); + } +}; + +template +class CudnnActivationGradKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor *X, *Out, *dOut; + X = Out = dOut = nullptr; + framework::Tensor* dX = nullptr; + ExtractActivationGradTensor(context, &X, &Out, &dOut, &dX); + dX->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); + Functor functor(dev_ctx); + functor(detail::Ref(X), detail::Ref(Out), detail::Ref(dOut), dX); + } +}; + +} // namespace operators +} // namespace paddle + +namespace plat = paddle::platform; +namespace ops = paddle::operators; + +#define FOR_EACH_CUDNN_OP_FUNCTOR(__macro) \ + __macro(relu, CudnnReluFunctor, CudnnReluGradFunctor); \ + __macro(relu6, CudnnRelu6Functor, CudnnRelu6GradFunctor); \ + __macro(sigmoid, CudnnTanhFunctor, CudnnTanhGradFunctor); \ + __macro(tanh, CudnnTanhFunctor, CudnnTanhGradFunctor) + +#define REGISTER_ACTIVATION_CUDNN_KERNEL(act_type, functor, grad_functor) \ + REGISTER_OP_KERNEL(act_type, CUDNN, plat::CUDAPlace, \ + ops::CudnnActivationKernel>, \ + ops::CudnnActivationKernel>); \ + REGISTER_OP_KERNEL( \ + act_type##_grad, CUDNN, plat::CUDAPlace, \ + ops::CudnnActivationGradKernel>, \ + ops::CudnnActivationGradKernel>); + +FOR_EACH_CUDNN_OP_FUNCTOR(REGISTER_ACTIVATION_CUDNN_KERNEL); diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 65efe2966ce12e86ba7f4944eb57ae72cdf9796f..f79960317aa1bac7ae9f8d80e4886dde8fe8ebcb 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -13,32 +13,41 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/activation_op.h" +#include #include +#include #include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h" #include "paddle/fluid/platform/port.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cudnn_helper.h" +#endif namespace paddle { namespace operators { using paddle::framework::Tensor; -#define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT) \ - class OP_NAME##OpMaker \ - : public ::paddle::framework::OpProtoAndCheckerMaker { \ - public: \ - void Make() override { \ - AddInput("X", "Input of " #OP_NAME " operator"); \ - AddOutput("Out", "Output of " #OP_NAME " operator"); \ - AddAttr("use_mkldnn", \ - "(bool, default false) Only used in mkldnn kernel") \ - .SetDefault(false); \ - AddAttr( \ - "is_test", \ - "(bool, default false) Set to true for inference only, false " \ - "for training. Some layers may run faster when this is true.") \ - .SetDefault(false); \ - AddComment(OP_COMMENT); \ - } \ +#define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT) \ + class OP_NAME##OpMaker \ + : public ::paddle::framework::OpProtoAndCheckerMaker { \ + public: \ + void Make() override { \ + AddInput("X", "Input of " #OP_NAME " operator"); \ + AddOutput("Out", "Output of " #OP_NAME " operator"); \ + AddAttr("use_mkldnn", \ + "(bool, default false) Only used in mkldnn kernel") \ + .SetDefault(false); \ + AddAttr("use_cudnn", \ + "(bool, default false) Only used in cudnn kernel, need " \ + "install cudnn") \ + .SetDefault(false); \ + AddAttr( \ + "is_test", \ + "(bool, default false) Set to true for inference only, false " \ + "for training. Some layers may run faster when this is true.") \ + .SetDefault(false); \ + AddComment(OP_COMMENT); \ + } \ } #define REGISTER_ACTIVATION_OP_GRAD_MAKER(OP_NAME, KERNEL_TYPE) \ @@ -67,6 +76,12 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx, const std::string& name) { framework::LibraryType library{framework::LibraryType::kPlain}; framework::DataLayout layout = framework::DataLayout::kAnyLayout; +#ifdef PADDLE_WITH_CUDA + auto it1 = oper.Attrs().find("use_cudnn"); + if (it1 != oper.Attrs().end() && platform::CanCUDNNBeUsed(ctx)) { + library = framework::LibraryType::kCUDNN; + } +#endif #ifdef PADDLE_WITH_MKLDNN auto it = oper.Attrs().find("use_mkldnn"); if (library == framework::LibraryType::kPlain && it != oper.Attrs().end() && @@ -256,6 +271,48 @@ $$out = \\frac{x}{1 + \|x\|}$$ )DOC"; +class AcosOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Input of acos operator"); + AddOutput("Out", "Output of acos operator"); + AddComment(R"DOC( +Arccosine Activation Operator. + +$$out = \cos^{-1}(x)$$ + +)DOC"); + } +}; + +class AsinOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Input of asin operator"); + AddOutput("Out", "Output of asin operator"); + AddComment(R"DOC( +Arcsine Activation Operator. + +$$out = \sin^{-1}(x)$$ + +)DOC"); + } +}; + +class AtanOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Input of atan operator"); + AddOutput("Out", "Output of atan operator"); + AddComment(R"DOC( +Arctanh Activation Operator. + +$$out = \tanh^{-1}(x)$$ + +)DOC"); + } +}; + class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -530,7 +587,10 @@ namespace ops = paddle::operators; __macro(SoftShrink, softshrink); \ __macro(Abs, abs); \ __macro(Cos, cos); \ + __macro(Acos, acos); \ __macro(Sin, sin); \ + __macro(Asin, asin); \ + __macro(Atan, atan); \ __macro(Round, round); \ __macro(Log, log); \ __macro(Square, square); \ diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index c7df3ea58a91579e35ff0d486516271a6daf054f..ff7e623f6f383ed2a8b8a40b3186d9c439ff1d86 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -11,6 +11,7 @@ limitations under the License. */ #pragma once #include +#include #include #include #include @@ -24,6 +25,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detail/safe_ref.h" +#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/platform/float16.h" #ifdef PADDLE_WITH_MKLDNN @@ -37,9 +39,20 @@ namespace operators { Please refer to the layer_helper.py and get the details. */ static std::unordered_set InplaceOpSet = { - "sigmoid", "exp", "relu", "tanh", "sqrt", "ceil", - "floor", "reciprocal", "relu6", "soft_relu", "hard_sigmoid", -}; + "sigmoid", "exp", "relu", "tanh", "sqrt", "ceil", + "floor", "reciprocal", "relu6", "soft_relu", "hard_sigmoid"}; + +static bool IsInplace(const std::string& op) { + bool inplace = InplaceOpSet.count(op); + // for op_grad + const int kGradSuffixLen = 4; + if (op.size() > kGradSuffixLen && + op.compare(op.size() - kGradSuffixLen - 1, kGradSuffixLen, "grad")) { + inplace = + InplaceOpSet.count(op.substr(0, op.size() - (kGradSuffixLen + 1))); + } + return inplace; +} /* The following operator can be used to process SelectedRows, because the * output of those operator for zero is zero too. @@ -47,47 +60,97 @@ static std::unordered_set InplaceOpSet = { static std::unordered_set CanBeUsedBySelectedRows = { "abs", "abs_grad", "square", "square_grad", "sqrt", "sqrt_grad"}; -static bool IsInplace(std::string op) { return InplaceOpSet.count(op); } - -template -class ActivationKernel - : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - - void Compute(const framework::ExecutionContext& context) const override { +inline void ExtractActivationTensor(const framework::ExecutionContext& context, + const framework::Tensor** X, + framework::Tensor** Out) { + auto x_var = context.InputVar("X"); + auto out_var = context.OutputVar("Out"); + PADDLE_ENFORCE(x_var != nullptr, + "Cannot get input Variable X, variable name = %s", + context.op().Input("X")); + PADDLE_ENFORCE(out_var != nullptr, + "Cannot get output Variable Out, variable name = %s", + context.op().Output("Out")); + if (CanBeUsedBySelectedRows.count(context.op().Type())) { + *X = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var); + *Out = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( + out_var); + } else { + *X = context.Input("X"); + *Out = context.Output("Out"); + } + + PADDLE_ENFORCE(*Out != nullptr, + "Cannot get output tensor Out, variable name = %s", + context.op().Output("Out")); +} + +inline void ExtractActivationGradTensor( + const framework::ExecutionContext& context, const framework::Tensor** X, + const framework::Tensor** Out, const framework::Tensor** dOut, + framework::Tensor** dX) { + auto out_var = context.InputVar("Out"); + auto out_grad_var = context.InputVar(framework::GradVarName("Out")); + auto x_grad_var = context.OutputVar(framework::GradVarName("X")); + PADDLE_ENFORCE(out_var != nullptr, + "Cannot get input Variable Out, variable name = %s", + context.op().Input("Out")); + PADDLE_ENFORCE(out_grad_var != nullptr, + "Cannot get input Variable %s, variable name = %s", + framework::GradVarName("Out"), + context.op().Input(framework::GradVarName("Out"))); + PADDLE_ENFORCE(x_grad_var != nullptr, + "Cannot get output Variable %s, variable name = %s", + framework::GradVarName("X"), + context.op().Output(framework::GradVarName("X"))); + + if (CanBeUsedBySelectedRows.count(context.op().Type())) { + *Out = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*out_var); + *dOut = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar( + *out_grad_var); + *dX = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( + x_grad_var); + } else { + *Out = context.Input("Out"); + *dOut = context.Input(framework::GradVarName("Out")); + *dX = context.Output(framework::GradVarName("X")); + } + PADDLE_ENFORCE(*dX != nullptr, + "Cannot get output tensor %s, variable name = %s", + framework::GradVarName("X"), + context.op().Output(framework::GradVarName("X"))); + + bool inplace = IsInplace(context.op().Type()); + if (!inplace) { auto x_var = context.InputVar("X"); - auto out_var = context.OutputVar("Out"); PADDLE_ENFORCE(x_var != nullptr, - "Cannot get input Variable X, variable name = %s", + "Cannot get input tensor X, variable name = %s", context.op().Input("X")); - PADDLE_ENFORCE(out_var != nullptr, - "Cannot get output Variable Out, variable name = %s", - context.op().Output("Out")); - - framework::Tensor X, *Out; - if (CanBeUsedBySelectedRows.count(context.op().Type())) { - X = detail::Ref( - paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var), - "Cannot get input Tensor X, variable name = %s", - context.op().Input("X")); - Out = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( - out_var); + *X = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var); } else { - X = detail::Ref(context.Input("X"), - "Cannot get input Tensor X, variable name = %s", - context.op().Input("X")); - Out = context.Output("Out"); + *X = context.Input("X"); } + } else { + VLOG(10) << " Inplace activation of Op : " << context.op().Type(); + *X = *dX; + } +} - PADDLE_ENFORCE(Out != nullptr, - "Cannot get output tensor Out, variable name = %s", - context.op().Output("Out")); +template +class ActivationKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* X = nullptr; + framework::Tensor* Out = nullptr; + ExtractActivationTensor(context, &X, &Out); Out->mutable_data(context.GetPlace()); - auto x = framework::EigenVector::Flatten(X); - auto out = framework::EigenVector::Flatten(*Out); + + auto x = framework::EigenVector::Flatten(detail::Ref(X)); + auto out = framework::EigenVector::Flatten(detail::Ref(Out)); auto* place = context.template device_context().eigen_device(); Functor functor; @@ -106,55 +169,15 @@ class ActivationGradKernel public: using T = typename Functor::ELEMENT_TYPE; void Compute(const framework::ExecutionContext& context) const override { - auto out_var = context.InputVar("Out"); - auto out_grad_var = context.InputVar(framework::GradVarName("Out")); - auto x_grad_var = context.OutputVar(framework::GradVarName("X")); - PADDLE_ENFORCE(out_var != nullptr, - "Cannot get input Variable Out, variable name = %s", - context.op().Input("Out")); - PADDLE_ENFORCE(out_grad_var != nullptr, - "Cannot get input Variable %s, variable name = %s", - framework::GradVarName("Out"), - context.op().Input(framework::GradVarName("Out"))); - PADDLE_ENFORCE(x_grad_var != nullptr, - "Cannot get output Variable %s, variable name = %s", - framework::GradVarName("X"), - context.op().Output(framework::GradVarName("X"))); - - framework::Tensor Out, dOut, *dX; - if (CanBeUsedBySelectedRows.count(context.op().Type())) { - Out = detail::Ref( - paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*out_var), - "Cannot get input Tensor Out, variable name = %s", - context.op().Input("Out")); - dOut = - detail::Ref(paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar( - *out_grad_var), - "Cannot get input Tensor %s, variable name = %s", - framework::GradVarName("Out"), - context.op().Input(framework::GradVarName("Out"))); - dX = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( - x_grad_var); - } else { - Out = detail::Ref(context.Input("Out"), - "Cannot get input Tensor Out, variable name = %s", - context.op().Input("Out")); - dOut = detail::Ref( - context.Input(framework::GradVarName("Out")), - "Cannot get input Tensor %s, variable name = %s", - framework::GradVarName("Out"), - context.op().Input(framework::GradVarName("Out"))); - dX = context.Output(framework::GradVarName("X")); - } - PADDLE_ENFORCE(dX != nullptr, - "Cannot get output tensor %s, variable name = %s", - framework::GradVarName("X"), - context.op().Output(framework::GradVarName("X"))); + const framework::Tensor *X, *Out, *dOut; + framework::Tensor* dX = nullptr; + X = Out = dOut = nullptr; + ExtractActivationGradTensor(context, &X, &Out, &dOut, &dX); dX->mutable_data(context.GetPlace()); - - auto dout = framework::EigenVector::Flatten(dOut); - auto out = framework::EigenVector::Flatten(Out); - auto dx = framework::EigenVector::Flatten(*dX); + auto dout = framework::EigenVector::Flatten(detail::Ref(dOut)); + auto out = framework::EigenVector::Flatten(detail::Ref(Out)); + auto dx = framework::EigenVector::Flatten(detail::Ref(dX)); + auto x = framework::EigenVector::Flatten(detail::Ref(X)); auto* place = context.template device_context().eigen_device(); Functor functor; @@ -162,27 +185,7 @@ class ActivationGradKernel for (auto& attr : attrs) { *attr.second = context.Attr(attr.first); } - bool inplace = functor.Inplace(); - if (!inplace) { - auto x_var = context.InputVar("X"); - PADDLE_ENFORCE(x_var != nullptr, - "Cannot get input tensor X, variable name = %s", - context.op().Input("X")); - framework::Tensor X; - if (CanBeUsedBySelectedRows.count(context.op().Type())) { - X = detail::Ref( - paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var)); - } else { - X = detail::Ref(context.Input("X")); - } - - auto x = framework::EigenVector::Flatten(X); - functor(*place, x, out, dout, dx); - } else { - VLOG(10) << " Inplace activation "; - auto x = framework::EigenVector::Flatten(*dX); - functor(*place, x, out, dout, dx); - } + functor(*place, x, out, dout, dx); } }; @@ -214,7 +217,6 @@ struct SigmoidFunctor : public BaseActivationFunctor { template struct SigmoidGradFunctor : public BaseActivationFunctor { - bool Inplace() const { return IsInplace("sigmoid"); } template void operator()(Device d, X x, Out out, dOut dout, dX dx) const { @@ -269,7 +271,6 @@ struct ExpFunctor : public BaseActivationFunctor { template struct ExpGradFunctor : public BaseActivationFunctor { - bool Inplace() const { return IsInplace("exp"); } template void operator()(Device d, X x, Out out, dOut dout, dX dx) const { @@ -288,7 +289,6 @@ struct ReluFunctor : public BaseActivationFunctor { template struct ReluGradFunctor : public BaseActivationFunctor { - bool Inplace() const { return IsInplace("relu"); } template void operator()(Device d, X x, Out out, dOut dout, dX dx) const { @@ -301,8 +301,28 @@ template struct GeluFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out) const { +// Because the execute or device context can not be deliver here, it keep the +// marco for NVCC. +#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ + !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) + auto x_data = x.data(); + auto out_data = out.data(); + int n = std::min(x.size(), out.size()); + + std::memset(out_data, 0, n * sizeof(T)); + math::CBlas::AXPY(n, static_cast(M_SQRT1_2), x_data, 1, out_data, 1); + math::CBlas::VMERF(n, out_data, out_data, VML_LA); + for (int i = 0; i < n; i++) { + out_data[i] += static_cast(1); + } + math::CBlas::VMUL(n, x_data, out_data, out_data); + for (int i = 0; i < n; i++) { + out_data[i] *= static_cast(0.5); + } +#else auto temp = (x * static_cast(M_SQRT1_2)).erf(); out.device(d) = x * static_cast(0.5) * (static_cast(1) + temp); +#endif } }; @@ -331,7 +351,6 @@ struct TanhFunctor : public BaseActivationFunctor { template struct TanhGradFunctor : public BaseActivationFunctor { - bool Inplace() const { return IsInplace("tanh"); } template void operator()(Device d, X x, Out out, dOut dout, dX dx) const { @@ -437,7 +456,6 @@ struct SqrtFunctor : public BaseActivationFunctor { template struct SqrtGradFunctor : public BaseActivationFunctor { - bool Inplace() const { return IsInplace("sqrt"); } template void operator()(Device d, X x, Out out, dOut dout, dX dx) const { @@ -456,7 +474,6 @@ struct CeilFunctor : public BaseActivationFunctor { template struct ZeroGradFunctor : public BaseActivationFunctor { - bool Inplace() const { return IsInplace("ceil"); } template void operator()(Device d, X x, Out out, dOut dout, dX dx) const { @@ -535,6 +552,101 @@ struct SinFunctor : public BaseActivationFunctor { } }; +template +struct Acos { + HOSTDEVICE T operator()(const T& val) const { return acos(val); } +}; + +template <> +struct Acos { + HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { + return platform::float16(acos(static_cast(val))); + } +}; + +// Acos(x) = acos(x) +template +struct AcosFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Acos()); + } +}; + +// acos'(x) = -1/sqrt(1-x^2) +template +struct AcosGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = + -dout * static_cast(1) / (static_cast(1) - x.square()).sqrt(); + } +}; + +template +struct Asin { + HOSTDEVICE T operator()(const T& val) const { return asin(val); } +}; + +template <> +struct Asin { + HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { + return platform::float16(asin(static_cast(val))); + } +}; + +// Asin(x) = asin(x) +template +struct AsinFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Asin()); + } +}; + +// asin'(x) = 1/sqrt(1-x^2) +template +struct AsinGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = + dout * static_cast(1) / (static_cast(1) - x.square()).sqrt(); + } +}; + +template +struct Atan { + HOSTDEVICE T operator()(const T& val) const { return atan(val); } +}; + +template <> +struct Atan { + HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { + return platform::float16(atan(static_cast(val))); + } +}; + +// Atan(x) = atan(x) +template +struct AtanFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Atan()); + } +}; + +// atan'(x) = 1 / (1 + x^2) +template +struct AtanGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * static_cast(1) / (static_cast(1) + x.square()); + } +}; + // round(x) = [x] template struct RoundFunctor : public BaseActivationFunctor { @@ -573,7 +685,6 @@ struct ReciprocalFunctor : public BaseActivationFunctor { template struct ReciprocalGradFunctor : public BaseActivationFunctor { - bool Inplace() const { return IsInplace("reciprocal"); } template void operator()(Device d, X x, Out out, dOut dout, dX dx) const { @@ -673,7 +784,6 @@ struct Relu6GradFunctor : public BaseActivationFunctor { typename BaseActivationFunctor::AttrPair GetAttrs() { return {{"threshold", &threshold}}; } - bool Inplace() const { return IsInplace("relu6"); } template void operator()(Device d, X x, Out out, dOut dout, dX dx) const { @@ -755,7 +865,6 @@ struct SoftReluGradFunctor : public BaseActivationFunctor { typename BaseActivationFunctor::AttrPair GetAttrs() { return {{"threshold", &threshold}}; } - bool Inplace() const { return IsInplace("soft_relu"); } template void operator()(Device d, X x, Out out, dOut dout, dX dx) const { @@ -936,7 +1045,6 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor { typename BaseActivationFunctor::AttrPair GetAttrs() { return {{"slope", &slope}, {"offset", &offset}}; } - bool Inplace() { return IsInplace("hard_sigmoid"); } template void operator()(Device d, X x, Out out, dOut dout, dX dx) const { @@ -987,13 +1095,16 @@ struct SwishGradFunctor : public BaseActivationFunctor { __macro(relu, ReluFunctor, ReluGradFunctor); \ __macro(gelu, GeluFunctor, GeluGradFunctor); \ __macro(tanh, TanhFunctor, TanhGradFunctor); \ + __macro(atan, AtanFunctor, AtanGradFunctor); \ __macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ __macro(sqrt, SqrtFunctor, SqrtGradFunctor); \ __macro(abs, AbsFunctor, AbsGradFunctor); \ __macro(ceil, CeilFunctor, ZeroGradFunctor); \ __macro(floor, FloorFunctor, ZeroGradFunctor); \ __macro(cos, CosFunctor, CosGradFunctor); \ + __macro(acos, AcosFunctor, AcosGradFunctor); \ __macro(sin, SinFunctor, SinGradFunctor); \ + __macro(asin, AsinFunctor, AsinGradFunctor); \ __macro(round, RoundFunctor, ZeroGradFunctor); \ __macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ __macro(log, LogFunctor, LogGradFunctor); \ diff --git a/paddle/fluid/operators/alloc_continuous_space_op.cc b/paddle/fluid/operators/alloc_continuous_space_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..df0e9911cf7186e952cfd7fbf7f43889e9098c84 --- /dev/null +++ b/paddle/fluid/operators/alloc_continuous_space_op.cc @@ -0,0 +1,211 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +static framework::proto::VarType::Type kDefaultDtype = + framework::proto::VarType::Type::VarType_Type_BOOL; + +template +class AllocContinuousSpaceKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto &in_var_names = context.Inputs("Input"); + auto &out_var_names = context.Outputs("Output"); + auto &in_vars = context.MultiInputVar("Input"); + auto out_vars = context.MultiOutputVar("Output"); + + PADDLE_ENFORCE_GT(in_var_names.size(), static_cast(0)); + PADDLE_ENFORCE_EQ(in_var_names.size(), out_var_names.size()); + + for (size_t i = 0; i < in_var_names.size(); ++i) { + // Only support LoDTensor + PADDLE_ENFORCE_NOT_NULL(in_vars[i], "%s should not be nullptr,", + in_var_names[i]); + PADDLE_ENFORCE_NOT_NULL(out_vars[i], "%s should not be nullptr,", + out_var_names[i]); + PADDLE_ENFORCE(in_vars[i]->IsType()); + PADDLE_ENFORCE(out_vars[i]->IsType()); + } + + auto in_tensors = context.MultiInput("Input"); + + if (context.Attr("check_name")) { + for (size_t i = 0; i < in_var_names.size(); ++i) { + PADDLE_ENFORCE_EQ(in_var_names[i], out_var_names[i]); + } + } else { + // Init the output as input + for (size_t i = 0; i < in_tensors.size(); ++i) { + out_vars[i]->GetMutable()->Resize( + in_tensors[i]->dims()); + } + } + + auto &dev_ctx = context.template device_context(); + + // Get numel and dtype + size_t numel = 0; + auto dtype = kDefaultDtype; + GetMemSizeAndDtype(in_tensors, in_var_names, &numel, &dtype); + + // Alloc the continuous space + auto fused_tensor = context.Output("FusedOutput"); + fused_tensor->Resize(framework::make_ddim({static_cast(numel)})) + .mutable_data(context.GetPlace(), dtype); + + // Init the continuous space + auto out_tensors = context.MultiOutput("Output"); + int64_t offset = 0; + if (context.Attr("copy_data")) { + for (size_t i = 0; i < in_var_names.size(); ++i) { + int64_t len = out_tensors[i]->numel(); + auto sub_tensor = fused_tensor->Slice(offset, offset + len); + offset += len; + framework::TensorCopy(*out_tensors[i], context.GetPlace(), dev_ctx, + &sub_tensor); + } + } else if (context.Attr("set_constant")) { + math::SetConstant set_constant; + set_constant(dev_ctx, fused_tensor, + static_cast(context.Attr("constant"))); + } + + // Make the outputs point to the continuous space. + offset = 0; + for (size_t i = 0; i < out_tensors.size(); ++i) { + int64_t len = out_tensors[i]->numel(); + auto dim = out_tensors[i]->dims(); + out_tensors[i] + ->ShareDataWith(fused_tensor->Slice(offset, offset + len)) + .Resize(dim); + offset += len; + VLOG(10) << "alloc_space_for_vars: output(" << out_var_names[i] + << ") ,dim:(" << dim << ")" + << " Address: " << out_tensors[i]->data(); + } + } + + void GetMemSizeAndDtype( + const std::vector &lod_tensors, + const std::vector var_names, size_t *numel, + framework::proto::VarType::Type *dtype) const { + PADDLE_ENFORCE_EQ(lod_tensors.size(), var_names.size()); + *numel = 0; + for (size_t i = 0; i < var_names.size(); ++i) { + PADDLE_ENFORCE(lod_tensors[i]->IsInitialized(), "%s is not initialized.", + var_names[i]); + + auto p_dtype = lod_tensors[i]->type(); + if (*dtype == kDefaultDtype) { + PADDLE_ENFORCE_NE(p_dtype, kDefaultDtype, "%s's type should not be %s.", + var_names[i], kDefaultDtype); + *dtype = p_dtype; + } + PADDLE_ENFORCE_EQ(p_dtype, *dtype, "Input vars is not equal."); + + auto size = lod_tensors[i]->numel(); + PADDLE_ENFORCE_GT(size, 0); + VLOG(10) << "alloc_space_for_vars: input(" << var_names[i] << ") ,dim:(" + << lod_tensors[i]->dims() << ")"; + *numel += size; + } + } +}; + +class AllocContinuousSpaceOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override {} +}; + +class AllocContinuousSpaceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Input", + "(vector) The input tensors of" + " alloc_continuous_space operator.") + .AsDuplicable(); + AddOutput("Output", + "(vector) The output " + "tensors of alloc_continuous_space operator. And the address " + "of output tensors are continuous, they are sliced from the " + "tensor of FusedOutput.") + .AsDuplicable(); + AddOutput("FusedOutput", + "(LoDTensor) The output tensor " + "of alloc_continuous_space operator. And the tensors of" + " Output is sliced from the tensor of FusedOutput."); + AddAttr("copy_data", "Whether to copy the Input value to Output.") + .SetDefault(false); + AddAttr("set_constant", + "Whether to set the Output with a constant value.") + .SetDefault(false); + AddAttr("constant", + "If set_constant is true, the constant value will be used " + "to set the Output.") + .SetDefault(0.0); + AddAttr("check_name", + "Whether to check the name of Input and Output to ensure " + "they are the same separately.") + .SetDefault(false); + AddComment(R"DOC( +AllocContinuousSpace Operator. + +alloc_continuous_space is used to make the address of Output +continuous according to the Input. This Op will alloc a big tensor +according to the tensors of Input, the dtype is the same with those input tensors, +the size is the sum of those input tensors' numel, and the dim of the big +tensor is {sum(numel)}. And the big tensor is stored in FusedOutput. +The tensors of Output are sliced from the tensor of FusedOutput. +Note that, the dtype of Input should be the same, and the dim of Input +and Output should equal. +The tensors of Input and Output could be the same or different. And +alloc_continuous_space allows copying the value of Input to Output, or +setting the Output with a constant value. + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(alloc_continuous_space, + paddle::operators::AllocContinuousSpaceOp, + paddle::operators::AllocContinuousSpaceOpMaker); +namespace ops = paddle::operators; +REGISTER_OP_CPU_KERNEL( + alloc_continuous_space, + ops::AllocContinuousSpaceKernel, + ops::AllocContinuousSpaceKernel, + ops::AllocContinuousSpaceKernel); + +#ifdef PADDLE_WITH_CUDA +REGISTER_OP_CUDA_KERNEL( + alloc_continuous_space, + ops::AllocContinuousSpaceKernel, + ops::AllocContinuousSpaceKernel, + ops::AllocContinuousSpaceKernel); +#endif diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index b6996be4b0984bcee3b16da268d79708a68b65b3..912ec79910301b67bc520b1aa78d3fa1fd165d1f 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -293,7 +293,7 @@ class AttentionLSTMKernel : public framework::OpKernel { int len = x_lod[0][i + 1] - x_lod[0][i]; max_seq_len = max_seq_len < len ? len : max_seq_len; } - PADDLE_ENFORCE_EQ(x_lod.size(), 1, "Input(X)'s lod size must be 1."); + PADDLE_ENFORCE_EQ(x_lod.size(), 1UL, "Input(X)'s lod size must be 1."); PADDLE_ENFORCE_EQ(c0->dims()[0], N, "C0 dims should be %d x %d.", N, D); fc_out->Resize({max_seq_len, 1}); diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc index 7f2bde55c98277b9fd4b3374657001c42d673d43..cf78c83297a87beb08a8b8e6e4b182f03f1909d3 100644 --- a/paddle/fluid/operators/beam_search_decode_op.cc +++ b/paddle/fluid/operators/beam_search_decode_op.cc @@ -123,7 +123,7 @@ class BeamSearchDecodeOp : public framework::OperatorBase { auto& dev_ctx = *pool.Get(dev_place); framework::RuntimeContext run_ctx(Inputs(), Outputs(), scope); - framework::ExecutionContext ctx(*this, scope, dev_ctx, run_ctx); + framework::ExecutionContext ctx(*this, scope, dev_ctx, run_ctx, nullptr); const LoDTensorArray* ids = ctx.Input("Ids"); const LoDTensorArray* scores = ctx.Input("Scores"); diff --git a/paddle/fluid/operators/beam_search_decode_op.h b/paddle/fluid/operators/beam_search_decode_op.h index 6aefc5446f167eebb0da673b3fbdf7ed128daa98..0b883c3158fb922caae2e731875bbb8d43a1e9ca 100644 --- a/paddle/fluid/operators/beam_search_decode_op.h +++ b/paddle/fluid/operators/beam_search_decode_op.h @@ -122,7 +122,7 @@ void BeamSearchDecoder::ConvertSentenceVectorToLodTensor( auto cpu_place = std::unique_ptr( new paddle::platform::CPUPlace()); - paddle::platform::CPUDeviceContext cpu_ctx(*cpu_place.get()); + paddle::platform::CPUDeviceContext cpu_ctx(*cpu_place); framework::LoD lod; lod.push_back(source_level_lod); diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc index e93cd8615e052e4dfc6255549bf7a9b84b7dd657..fa6b09b4e7ec58624c91f1e4f428871232c0a083 100644 --- a/paddle/fluid/operators/beam_search_op.cc +++ b/paddle/fluid/operators/beam_search_op.cc @@ -51,9 +51,9 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("selected_scores", "A LoDTensor containing the accumulated scores corresponding to " "Output(selected_ids)."); - AddOutput( - "parent_idx", - "A Tensor preserving the selected_ids' parent indice in pre_ids."); + AddOutput("parent_idx", + "A Tensor preserving the selected_ids' parent indice in pre_ids.") + .AsDispensable(); // Attributes stored in AttributeMap AddAttr("level", "the level of LoDTensor"); diff --git a/paddle/fluid/operators/beam_search_op.h b/paddle/fluid/operators/beam_search_op.h index f808020cc765585d1633c6c3bf528080a7e83f07..3d32ea0cc9686a709b185087d76d12f266663d03 100644 --- a/paddle/fluid/operators/beam_search_op.h +++ b/paddle/fluid/operators/beam_search_op.h @@ -44,7 +44,6 @@ class BeamSearchOpKernel : public framework::OpKernel { auto* parent_idx = context.Output("parent_idx"); PADDLE_ENFORCE_NOT_NULL(selected_ids); PADDLE_ENFORCE_NOT_NULL(selected_scores); - PADDLE_ENFORCE_NOT_NULL(parent_idx); math::BeamSearchFunctor alg; alg(context.template device_context(), pre_ids, pre_scores, diff --git a/paddle/fluid/operators/benchmark/CMakeLists.txt b/paddle/fluid/operators/benchmark/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..54008336a9f67f0123ba1cfa6fcea35b79b7ac4c --- /dev/null +++ b/paddle/fluid/operators/benchmark/CMakeLists.txt @@ -0,0 +1,3 @@ +cc_test(op_tester SRCS op_tester.cc op_tester_config.cc + DEPS memory timer framework_proto proto_desc lod_tensor op_registry + device_context scope ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) diff --git a/paddle/fluid/operators/benchmark/op_tester.cc b/paddle/fluid/operators/benchmark/op_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..fec091255f6391b77cd2858905f3aa2e5dd8baff --- /dev/null +++ b/paddle/fluid/operators/benchmark/op_tester.cc @@ -0,0 +1,509 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/benchmark/op_tester.h" +#include +#include "gflags/gflags.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/platform/init.h" +#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/timer.h" +#include "paddle/fluid/pybind/pybind.h" + +namespace paddle { +namespace operators { +namespace benchmark { + +DEFINE_string(op_config_list, "", "Path of op config file."); +DEFINE_int32(specified_config_id, -1, "Test the specified op config."); + +void OpTester::Init(const std::string &filename) { + Init(OpTesterConfig(filename)); +} + +void OpTester::Init(const OpTesterConfig &config) { + config_ = config; + + auto &op_desc_info = framework::OpInfoMap::Instance(); + // Initialize the OpDesc + if (op_desc_info.Has(config_.op_type)) { + type_ = config_.op_type; + + CreateOpDesc(); + CreateInputVarDesc(); + CreateOutputVarDesc(); + } else { + LOG(FATAL) << "Op \"" << config_.op_type << "\" is not registered."; + } + + if (config_.device_id >= 0) { + place_ = paddle::platform::CUDAPlace(config_.device_id); + } else { + place_ = paddle::platform::CPUPlace(); + } + + framework::InitDevices(false); + scope_.reset(new paddle::framework::Scope()); + + op_ = framework::OpRegistry::CreateOp(op_desc_); + CreateVariables(scope_.get()); +} + +void OpTester::Run() { + if (config_.print_debug_string) { + LOG(INFO) << DebugString(); + } + + // Warm up + RunImpl(); + + platform::Timer timer; + if (config_.profile) { + if (platform::is_cpu_place(place_)) { + platform::EnableProfiler(platform::ProfilerState::kCPU); + } else { +#ifdef PADDLE_WITH_CUDA + platform::EnableProfiler(platform::ProfilerState::kAll); + platform::SetDeviceId(config_.device_id); +#else + PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); +#endif + } + + timer.Start(); + for (int i = config_.repeat; i > 0; --i) { + RunImpl(); + } + timer.Pause(); + platform::DisableProfiler(platform::EventSortingKey::kDefault, + "op_tester_profiler"); + } else { + timer.Start(); + for (int i = config_.repeat; i > 0; --i) { + RunImpl(); + } + timer.Pause(); + } + config_.runtime = timer.ElapsedMS() / config_.repeat; + LOG(INFO) << "=== Run " << config_.repeat + << " times, latency: " << config_.runtime << " ms ==="; +} + +void OpTester::RunImpl() { + op_->Run(*scope_, place_); + platform::DeviceContextPool::Instance().Get(place_)->Wait(); + scope_->DropKids(); +} + +std::vector OpTester::GetOpProtoInputNames() { + std::vector input_names; + const framework::proto::OpProto &proto = + framework::OpInfoMap::Instance().Get(type_).Proto(); + for (int i = 0; i != proto.inputs_size(); ++i) { + const auto &input = proto.inputs(i); + input_names.push_back(input.name()); + } + return input_names; +} + +std::vector OpTester::GetOpProtoOutputNames() { + std::vector output_names; + const framework::proto::OpProto &proto = + framework::OpInfoMap::Instance().Get(type_).Proto(); + for (int i = 0; i != proto.outputs_size(); ++i) { + const auto &output = proto.outputs(i); + output_names.push_back(output.name()); + } + return output_names; +} + +std::unordered_map +OpTester::GetOpProtoAttrNames() { + std::unordered_map attr_types; + const framework::proto::OpProto &proto = + framework::OpInfoMap::Instance().Get(type_).Proto(); + const std::vector skipped_attrs = { + framework::OpProtoAndCheckerMaker::OpRoleAttrName(), + framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(), + framework::OpProtoAndCheckerMaker::OpNamescopeAttrName(), + framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName()}; + for (int i = 0; i != proto.attrs_size(); ++i) { + const auto &attr = proto.attrs(i); + if (!Has(skipped_attrs, attr.name())) { + VLOG(4) << "attr: " << attr.name() << ", type: " << attr.type(); + attr_types[attr.name()] = attr.type(); + } + } + return attr_types; +} + +framework::proto::VarType::Type OpTester::TransToVarType(std::string str) { + if (str == "int32") { + return framework::proto::VarType::INT32; + } else if (str == "int64") { + return framework::proto::VarType::INT64; + } else if (str == "fp32") { + return framework::proto::VarType::FP32; + } else if (str == "fp64") { + return framework::proto::VarType::FP64; + } else { + PADDLE_THROW("Unsupported dtype %s.", str.c_str()); + } +} + +void OpTester::CreateInputVarDesc() { + std::vector input_names = GetOpProtoInputNames(); + for (auto &name : input_names) { + const OpInputConfig *input = config_.GetInput(name); + if (input == nullptr) { + LOG(FATAL) << "The input " << name << " of op " << config_.op_type + << " is not correctlly provided."; + } + + std::string var_name = config_.op_type + "." + name; + framework::VarDesc *var = Var(var_name); + // Need to support more type + var->SetType(framework::proto::VarType::LOD_TENSOR); + var->SetPersistable(false); + var->SetDataType(TransToVarType(input->dtype)); + var->SetShape(input->dims); + + op_desc_.SetInput(name, {var_name}); + inputs_[var_name] = *input; + } +} + +void OpTester::CreateOutputVarDesc() { + std::vector output_names = GetOpProtoOutputNames(); + for (auto &name : output_names) { + std::string var_name = config_.op_type + "." + name; + framework::VarDesc *var = Var(var_name); + // Need to support more type + var->SetType(framework::proto::VarType::LOD_TENSOR); + var->SetPersistable(false); + var->SetDataType(framework::proto::VarType::FP32); + + op_desc_.SetOutput(name, {var_name}); + } +} + +void OpTester::CreateOpDesc() { + op_desc_.SetType(config_.op_type); + std::unordered_map attr_types = + GetOpProtoAttrNames(); + for (auto item : config_.attrs) { + const std::string &name = item.first; + if (attr_types.find(name) == attr_types.end()) { + LOG(FATAL) << "Operator " << type_ << " do not have attr " << name; + } + + const std::string &value_str = item.second; + const framework::proto::AttrType &type = attr_types[name]; + switch (type) { + case framework::proto::AttrType::BOOLEAN: + break; + case framework::proto::AttrType::INT: { + int value = StringTo(value_str); + op_desc_.SetAttr(name, {value}); + } break; + case framework::proto::AttrType::FLOAT: { + float value = StringTo(value_str); + op_desc_.SetAttr(name, {value}); + } break; + case framework::proto::AttrType::STRING: { + op_desc_.SetAttr(name, {value_str}); + } break; + case framework::proto::AttrType::BOOLEANS: + case framework::proto::AttrType::INTS: + case framework::proto::AttrType::FLOATS: + case framework::proto::AttrType::STRINGS: + LOG(FATAL) << "Not supported yet."; + break; + case framework::proto::AttrType::LONG: { + int64_t value = StringTo(value_str); + op_desc_.SetAttr(name, value); + } break; + case framework::proto::AttrType::LONGS: + default: + PADDLE_THROW("Unsupport attr type %d", type); + } + } +} + +framework::VarDesc *OpTester::Var(const std::string &name) { + auto it = vars_.find(name); + if (it != vars_.end()) { + return it->second.get(); + } + auto *var = new framework::VarDesc(name); + vars_[name].reset(var); + return var; +} + +template +void OpTester::SetupTensor(framework::LoDTensor *tensor, + const std::vector &shape, T lower, T upper, + const std::string &initializer) { + static unsigned int seed = 100; + std::mt19937 rng(seed++); + std::uniform_real_distribution uniform_dist(0, 1); + + T *ptr = tensor->mutable_data(framework::make_ddim(shape), place_); + + framework::LoDTensor cpu_tensor; + T *cpu_ptr = nullptr; + + if (!platform::is_cpu_place(place_)) { + cpu_ptr = cpu_tensor.mutable_data(framework::make_ddim(shape), + platform::CPUPlace()); + } else { + cpu_ptr = ptr; + } + + if (initializer == "random") { + for (int i = 0; i < cpu_tensor.numel(); ++i) { + cpu_ptr[i] = static_cast(uniform_dist(rng) * (upper - lower) + lower); + } + } else if (initializer == "natural") { + for (int i = 0; i < cpu_tensor.numel(); ++i) { + cpu_ptr[i] = lower + i; + } + } else if (initializer == "zeros") { + for (int i = 0; i < cpu_tensor.numel(); ++i) { + cpu_ptr[i] = 0; + } + } else { + PADDLE_THROW("Unsupported initializer %s.", initializer.c_str()); + } + + if (!platform::is_cpu_place(place_)) { + TensorCopySync(cpu_tensor, place_, tensor); + } +} + +void OpTester::CreateVariables(framework::Scope *scope) { + for (auto &item : vars_) { + auto &var = item.second; + if (var->Name() == framework::kEmptyVarName) { + continue; + } + + auto *ptr = scope->Var(var->Name()); + framework::InitializeVariable(ptr, var->GetType()); + if (var->Persistable()) { + VLOG(3) << "Create Variable " << var->Name() + << " global, which pointer is " << ptr; + } else { + VLOG(3) << "Create Variable " << var->Name() + << " locally, which pointer is " << ptr; + } + } + + for (auto &item : inputs_) { + // Allocate memory for input tensor + auto &var_name = item.first; + VLOG(3) << "Allocate memory for tensor " << var_name; + + auto &var_desc = vars_[var_name]; + std::vector shape = var_desc->GetShape(); + + auto *var = scope->Var(var_name); + auto *tensor = var->GetMutable(); + const auto &data_type = var_desc->GetDataType(); + if (data_type == framework::proto::VarType::INT32) { + SetupTensor(tensor, shape, 0, 1, item.second.initializer); + } else if (data_type == framework::proto::VarType::INT64) { + SetupTensor(tensor, shape, 0, 1, item.second.initializer); + } else if (data_type == framework::proto::VarType::FP32) { + SetupTensor(tensor, shape, static_cast(0.0), + static_cast(1.0), item.second.initializer); + } else if (data_type == framework::proto::VarType::FP64) { + SetupTensor(tensor, shape, static_cast(0.0), + static_cast(1.0), item.second.initializer); + } else { + PADDLE_THROW("Unsupported dtype %d.", data_type); + } + + VLOG(3) << "Set lod for tensor " << var_name; + std::vector> &lod_vec = item.second.lod; + framework::LoD lod; + for (size_t i = 0; i < lod_vec.size(); ++i) { + lod.push_back(lod_vec[i]); + } + tensor->set_lod(lod); + } +} + +static std::string GenSpaces(int count) { + std::stringstream ss; + for (int i = 0; i < count; ++i) { + ss << " "; + } + return ss.str(); +} + +std::string OpTester::DebugString() { + std::stringstream ss; + int count = 0; + for (auto &item : vars_) { + auto &var = item.second; + ss << GenSpaces(count++) << "vars {\n"; + ss << GenSpaces(count) << "name: \"" << var->Name() << "\"\n"; + ss << GenSpaces(count++) << "type: {\n"; + ss << GenSpaces(count) << "type: LOD_TENSOR\n"; + ss << GenSpaces(count++) << "lod_tensor {\n"; + ss << GenSpaces(count++) << "tensor {\n"; + const auto &data_type = var->GetDataType(); + if (data_type == framework::proto::VarType::INT32) { + ss << GenSpaces(count) << "data_type: INT32\n"; + } else if (data_type == framework::proto::VarType::INT64) { + ss << GenSpaces(count) << "data_type: INT64\n"; + } else if (data_type == framework::proto::VarType::FP32) { + ss << GenSpaces(count) << "data_type: FP32\n"; + } else if (data_type == framework::proto::VarType::FP64) { + ss << GenSpaces(count) << "data_type: FP64\n"; + } + std::vector shape = var->GetShape(); + for (auto d : shape) { + ss << GenSpaces(count) << "dims: " << d << "\n"; + } + ss << GenSpaces(--count) << "}\n"; + ss << GenSpaces(--count) << "}\n"; + ss << GenSpaces(--count) << "}\n"; + ss << GenSpaces(count) << "persistable: " << var->Persistable() << "\n"; + ss << GenSpaces(--count) << "}\n"; + } + ss << GenSpaces(count++) << "ops {\n"; + for (auto &name : op_desc_.InputNames()) { + ss << GenSpaces(count++) << "inputs {\n"; + ss << GenSpaces(count) << "parameters: \"" << name << "\"\n"; + ss << GenSpaces(count) << "arguments: \"" << op_desc_.Input(name)[0] + << "\"\n"; + ss << GenSpaces(--count) << "}\n"; + } + for (auto &name : op_desc_.OutputNames()) { + ss << GenSpaces(count++) << "outputs {\n"; + ss << GenSpaces(count) << "parameters: \"" << name << "\"\n"; + ss << GenSpaces(count) << "arguments: \"" << op_desc_.Output(name)[0] + << "\"\n"; + ss << GenSpaces(--count) << "}\n"; + } + ss << GenSpaces(count) << "type: " << op_desc_.Type() << "\n"; + for (auto &name : op_desc_.AttrNames()) { + ss << GenSpaces(count++) << "attrs {\n"; + const auto &attr_type = op_desc_.GetAttrType(name); + const auto &attr = op_desc_.GetAttr(name); + ss << GenSpaces(count) << "name: \"" << name << "\"\n"; + switch (attr_type) { + case framework::proto::AttrType::BOOLEAN: { + ss << GenSpaces(count) << "type: BOOLEAN\n"; + ss << GenSpaces(count) << "b: " << boost::get(attr) << "\n"; + } break; + case framework::proto::AttrType::INT: { + ss << GenSpaces(count) << "type: INT\n"; + ss << GenSpaces(count) << "i: " << boost::get(attr) << "\n"; + } break; + case framework::proto::AttrType::FLOAT: { + ss << GenSpaces(count) << "type: FLOAT\n"; + ss << GenSpaces(count) << "f: " << boost::get(attr) << "\n"; + } break; + case framework::proto::AttrType::STRING: { + ss << GenSpaces(count) << "type: STRING\n"; + ss << GenSpaces(count) << "s: \"" << boost::get(attr) + << "\"\n"; + } break; + case framework::proto::AttrType::BOOLEANS: { + ss << GenSpaces(count) << "type: BOOLEANS\n"; + ss << GenSpaces(count) << "bools: " + << "\n"; + } break; + case framework::proto::AttrType::INTS: { + ss << GenSpaces(count) << "type: INTS\n"; + ss << GenSpaces(count) << "ints: " + << "\n"; + } break; + case framework::proto::AttrType::FLOATS: { + ss << GenSpaces(count) << "type: FLOATS\n"; + ss << GenSpaces(count) << "floats: " + << "\n"; + } break; + case framework::proto::AttrType::STRINGS: { + ss << GenSpaces(count) << "type: STRINGS\n"; + ss << GenSpaces(count) << "strings: " + << "\n"; + } break; + case framework::proto::AttrType::LONG: { + ss << GenSpaces(count) << "type: LONG\n"; + ss << GenSpaces(count) << "l: " << boost::get(attr) << "\n"; + } break; + case framework::proto::AttrType::LONGS: { + ss << GenSpaces(count) << "type: LONGS\n"; + ss << GenSpaces(count) << "longs: " + << "\n"; + } break; + default: + PADDLE_THROW("Unsupport attr type %d", attr_type); + } + ss << GenSpaces(--count) << "}\n"; + } + ss << GenSpaces(--count) << "}\n"; + return ss.str(); +} + +TEST(op_tester, base) { + if (!FLAGS_op_config_list.empty()) { + std::ifstream fin(FLAGS_op_config_list, std::ios::in | std::ios::binary); + PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", + FLAGS_op_config_list.c_str()); + std::vector op_configs; + while (!fin.eof()) { + VLOG(4) << "Reading config " << op_configs.size() << "..."; + OpTesterConfig config; + bool result = config.Init(fin); + if (result) { + op_configs.push_back(config); + } + } + if (FLAGS_specified_config_id >= 0 && + FLAGS_specified_config_id < static_cast(op_configs.size())) { + OpTester tester; + tester.Init(op_configs[FLAGS_specified_config_id]); + tester.Run(); + } else { + for (size_t i = 0; i < op_configs.size(); ++i) { + OpTester tester; + tester.Init(op_configs[i]); + tester.Run(); + } + } + } else { + OpTester tester; + OpTesterConfig config; + config.op_type = "elementwise_add"; + config.inputs.resize(2); + config.inputs[0].name = "X"; + config.inputs[0].dims = {64, 64}; + config.inputs[1].name = "Y"; + config.inputs[1].dims = {64, 1}; + tester.Init(config); + tester.Run(); + } +} + +} // namespace benchmark +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/benchmark/op_tester.h b/paddle/fluid/operators/benchmark/op_tester.h new file mode 100644 index 0000000000000000000000000000000000000000..328389293c4b71a2f1fefbc3bf26fd46b79ec6e2 --- /dev/null +++ b/paddle/fluid/operators/benchmark/op_tester.h @@ -0,0 +1,75 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/benchmark/op_tester_config.h" + +namespace paddle { +namespace operators { +namespace benchmark { + +class OpTester { + public: + OpTester() {} + + void Init(const std::string &filename); + void Init(const OpTesterConfig &config); + + void Run(); + + std::string DebugString(); + + private: + std::vector GetOpProtoInputNames(); + std::vector GetOpProtoOutputNames(); + std::unordered_map + GetOpProtoAttrNames(); + + framework::proto::VarType::Type TransToVarType(std::string str); + void CreateInputVarDesc(); + void CreateOutputVarDesc(); + void CreateOpDesc(); + + framework::VarDesc *Var(const std::string &name); + void CreateVariables(framework::Scope *scope); + + template + void SetupTensor(framework::LoDTensor *input, + const std::vector &shape, T lower, T upper, + const std::string &initializer); + + void RunImpl(); + + private: + OpTesterConfig config_; + std::string type_; + framework::OpDesc op_desc_; + std::unordered_map> vars_; + std::unordered_map inputs_; + std::unique_ptr op_; + platform::Place place_; + std::unique_ptr scope_; +}; + +} // namespace benchmark +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/benchmark/op_tester_config.cc b/paddle/fluid/operators/benchmark/op_tester_config.cc new file mode 100644 index 0000000000000000000000000000000000000000..b4878ab04244cf6b54d323943fc1fbf4e3882660 --- /dev/null +++ b/paddle/fluid/operators/benchmark/op_tester_config.cc @@ -0,0 +1,226 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/benchmark/op_tester_config.h" +#include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace benchmark { + +static const char kStartSeparator[] = "{"; +static const char kEndSeparator[] = "}"; +static const char kSepBetweenItems[] = ";"; + +static bool StartWith(const std::string& str, const std::string& substr) { + return str.find(substr) == 0; +} + +static bool EndWith(const std::string& str, const std::string& substr) { + return str.rfind(substr) == (str.length() - substr.length()); +} + +static void EraseEndSep(std::string* str, + std::string substr = kSepBetweenItems) { + if (EndWith(*str, substr)) { + str->erase(str->length() - substr.length(), str->length()); + } +} + +OpInputConfig::OpInputConfig(std::istream& is) { + std::string sep; + is >> sep; + if (sep == kStartSeparator) { + while (sep != kEndSeparator) { + is >> sep; + if (sep == "name" || sep == "name:") { + is >> name; + EraseEndSep(&name); + } else if (sep == "dtype" || sep == "dtype:") { + ParseDType(is); + } else if (sep == "initializer" || sep == "initializer:") { + ParseInitializer(is); + } else if (sep == "dims" || sep == "dims:") { + ParseDims(is); + } else if (sep == "lod" || sep == "lod:") { + ParseLoD(is); + } + } + } +} + +void OpInputConfig::ParseDType(std::istream& is) { + std::string dtype_str; + is >> dtype_str; + EraseEndSep(&dtype_str); + + if (dtype_str == "int32" || dtype_str == "int") { + dtype = "int32"; + } else if (dtype_str == "int64" || dtype_str == "long") { + dtype = "int64"; + } else if (dtype_str == "fp32" || dtype_str == "float") { + dtype = "fp32"; + } else if (dtype_str == "fp64" || dtype_str == "double") { + dtype = "fp64"; + } else { + PADDLE_THROW("Unsupported dtype %s", dtype_str.c_str()); + } + VLOG(4) << "dtype of input " << name << " is: " << dtype; +} + +void OpInputConfig::ParseInitializer(std::istream& is) { + std::string initializer_str; + is >> initializer_str; + EraseEndSep(&initializer_str); + + const std::vector supported_initializers = {"random", "natural", + "zeros"}; + if (!Has(supported_initializers, initializer_str)) { + PADDLE_THROW("Unsupported initializer %s", initializer_str.c_str()); + } + + initializer = initializer_str; + VLOG(4) << "initializer of input " << name << " is: " << initializer; +} + +void OpInputConfig::ParseDims(std::istream& is) { + std::string dims_str; + is >> dims_str; + + dims.clear(); + std::string token; + std::istringstream token_stream(dims_str); + while (std::getline(token_stream, token, 'x')) { + dims.push_back(std::stoi(token)); + } +} + +void OpInputConfig::ParseLoD(std::istream& is) { + std::string lod_str; + std::string start_sep = + std::string(kStartSeparator) + std::string(kStartSeparator); + std::string end_sep = std::string(kEndSeparator) + std::string(kEndSeparator); + + std::string sep; + is >> sep; + if (StartWith(sep, start_sep)) { + lod_str += sep; + while (!EndWith(sep, end_sep)) { + is >> sep; + lod_str += sep; + } + } + EraseEndSep(&lod_str); + PADDLE_ENFORCE_GE(lod_str.length(), 4U); + VLOG(4) << "lod: " << lod_str << ", length: " << lod_str.length(); + + // Parse the lod_str + lod.clear(); + for (size_t i = 1; i < lod_str.length() - 1;) { + if (lod_str[i] == '{') { + std::vector level; + while (lod_str[i] != '}') { + ++i; + + std::string number; + while (lod_str[i] >= '0' && lod_str[i] <= '9') { + number += lod_str[i]; + ++i; + } + level.push_back(StringTo(number)); + } + lod.push_back(level); + } else if (lod_str[i] == '}') { + ++i; + } + } +} + +OpTesterConfig::OpTesterConfig(const std::string& filename) { + std::ifstream fin(filename, std::ios::in | std::ios::binary); + PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", + filename.c_str()); + + Init(fin); +} + +bool OpTesterConfig::Init(std::istream& is) { + std::string sep; + is >> sep; + if (sep == kStartSeparator) { + while (sep != kEndSeparator) { + is >> sep; + if (sep == "op_type" || sep == "op_type:") { + is >> op_type; + } else if (sep == "device_id" || sep == "device_id:") { + is >> device_id; + } else if (sep == "repeat" || sep == "repeat:") { + is >> repeat; + } else if (sep == "profile" || sep == "profile:") { + is >> profile; + } else if (sep == "print_debug_string" || sep == "print_debug_string:") { + is >> print_debug_string; + } else if (sep == "input" || sep == "input:") { + OpInputConfig input_config(is); + inputs.push_back(input_config); + } else if (sep == "attrs" || sep == "attrs:") { + ParseAttrs(is); + } else { + if (sep != kEndSeparator) { + return false; + } + } + } + } else { + return false; + } + return true; +} + +bool OpTesterConfig::ParseAttrs(std::istream& is) { + std::string sep; + is >> sep; + if (sep == kStartSeparator) { + while (true) { + std::string key; + is >> key; + if (key == kEndSeparator) { + break; + } + + std::string value; + is >> value; + EraseEndSep(&key, ":"); + EraseEndSep(&value); + VLOG(4) << "attrs: " << key << ", " << value; + + attrs[key] = value; + } + } + return true; +} + +const OpInputConfig* OpTesterConfig::GetInput(const std::string& name) { + for (size_t i = 0; i < inputs.size(); ++i) { + if (inputs[i].name == name) { + return &inputs[i]; + } + } + return nullptr; +} + +} // namespace benchmark +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/benchmark/op_tester_config.h b/paddle/fluid/operators/benchmark/op_tester_config.h new file mode 100644 index 0000000000000000000000000000000000000000..5803f82ac28867a481875c2af607290c5d366146 --- /dev/null +++ b/paddle/fluid/operators/benchmark/op_tester_config.h @@ -0,0 +1,82 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace paddle { +namespace operators { +namespace benchmark { + +struct OpInputConfig { + OpInputConfig() {} + explicit OpInputConfig(std::istream& is); + + void ParseDType(std::istream& is); + void ParseInitializer(std::istream& is); + void ParseDims(std::istream& is); + void ParseLoD(std::istream& is); + + std::string name; + std::string dtype{"fp32"}; // int32/int, int64/long, fp32/float, fp64/double + std::string initializer{"random"}; // random, natural + std::vector dims; + std::vector> lod; +}; + +struct OpTesterConfig { + OpTesterConfig() {} + explicit OpTesterConfig(const std::string& filename); + + bool Init(std::istream& is); + + bool ParseAttrs(std::istream& is); + + const OpInputConfig* GetInput(const std::string& name); + + std::string op_type; + std::vector inputs; + std::unordered_map attrs; + int device_id{-1}; // CPU: -1 + int repeat{1}; + int profile{0}; + int print_debug_string{0}; + double runtime{0.0}; +}; + +static bool Has(const std::vector& vec, const std::string& item) { + for (size_t i = 0; i < vec.size(); ++i) { + if (vec[i] == item) { + return true; + } + } + return false; +} + +template +T StringTo(const std::string& str) { + std::istringstream is(str); + T value; + is >> value; + return value; +} + +} // namespace benchmark +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc index 8d6a498dc941e44688ec8a2b49a6e080608f9b85..0c517cc757ca3f6f1ff7f4191ab2d529890b7154 100644 --- a/paddle/fluid/operators/cast_op.cc +++ b/paddle/fluid/operators/cast_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/cast_op.h" +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/float16.h" @@ -30,7 +31,8 @@ class CastOpProtoMaker : public framework::OpProtoAndCheckerMaker { Cast Operator. This Operator casts the input tensor to another data type and -returns tha Output Tensor. +returns the Output Tensor. It's meaningless if the output dtype equals +the input dtype, but it's fine if you do so. )DOC"); } diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt index b614e9b03502634a29333f331e25201a0f77ba38..7aa1c44eaafe53034b19ee52c59cc94d3a1269da 100644 --- a/paddle/fluid/operators/controlflow/CMakeLists.txt +++ b/paddle/fluid/operators/controlflow/CMakeLists.txt @@ -1,4 +1,5 @@ include(operators) register_operators(DEPS naive_executor) +cc_library(while_op_helper SRCS while_op_helper.cc DEPS operator) file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n") diff --git a/paddle/fluid/operators/controlflow/compare_op.cc b/paddle/fluid/operators/controlflow/compare_op.cc index 688457d4a75168577302e45817ef0463d6ff3718..5d3f9b43f8c08d356319fa0b9ccaf808811d3d39 100644 --- a/paddle/fluid/operators/controlflow/compare_op.cc +++ b/paddle/fluid/operators/controlflow/compare_op.cc @@ -51,6 +51,11 @@ class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker { comment.type)); AddInput("Y", string::Sprintf("the right hand operand of %s operator", comment.type)); + AddAttr( + "axis", + "The start dimension index for broadcasting Y onto X. [default -1]") + .SetDefault(-1) + .EqualGreaterThan(-1); AddAttr("force_cpu", "Force fill output variable to cpu " "memory. Otherwise, fill output variable to the running " @@ -64,11 +69,6 @@ N-dim tensor. X and Y could be any type. The each element of the Out tensor is calculated by $%s$ )DOC", comment.equation)); - AddAttr( - "axis", - "The start dimension index for broadcasting Y onto X. [default -1]") - .SetDefault(-1) - .EqualGreaterThan(-1); } }; diff --git a/paddle/fluid/operators/controlflow/get_places_op.cc b/paddle/fluid/operators/controlflow/get_places_op.cc index db6ff7825690176ded0ab957764ed8411d3cd804..1a157688f3d02185d18b66ff5ba3613b6cf438ad 100644 --- a/paddle/fluid/operators/controlflow/get_places_op.cc +++ b/paddle/fluid/operators/controlflow/get_places_op.cc @@ -52,7 +52,7 @@ class GetPlacesOp : public framework::OperatorBase { device_count = is_gpu ? CUDADevCount() : std::thread::hardware_concurrency(); } - PADDLE_ENFORCE_NE(device_count, 0, "Cannot indicate %s device count", + PADDLE_ENFORCE_NE(device_count, 0UL, "Cannot indicate %s device count", is_gpu ? "GPU" : "CPU"); auto out_var_name = Output("Out"); diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index 0360cf5273591946570cac47e2578e43f498b550..8352ba4f2b846af58d2d041ebf5201ee15f8481c 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -18,6 +18,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/controlflow/while_op_helper.h" #include "paddle/fluid/operators/detail/safe_ref.h" namespace paddle { @@ -26,14 +27,6 @@ namespace operators { using StepScopeVar = std::vector; using LoDTensor = framework::LoDTensor; -static constexpr char kStepBlock[] = "sub_block"; -static constexpr char kCondition[] = "Condition"; -static constexpr char kStepScopes[] = "StepScopes"; -static constexpr char kX[] = "X"; -static constexpr char kXGRAD[] = "X@GRAD"; -static constexpr char kOutputs[] = "Out"; -static constexpr char kSkipEagerDeletionVars[] = "skip_eager_deletion_vars"; - namespace { // NOLINT static std::string GetSkipEagerDeletionVarsDebugString( const std::vector &vars) { diff --git a/paddle/fluid/operators/controlflow/while_op_helper.cc b/paddle/fluid/operators/controlflow/while_op_helper.cc new file mode 100644 index 0000000000000000000000000000000000000000..2cbd94a061b5b369d67b6e0995d6b8fd45801828 --- /dev/null +++ b/paddle/fluid/operators/controlflow/while_op_helper.cc @@ -0,0 +1,291 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/controlflow/while_op_helper.h" +#include +#include +#include +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace operators { + +// OpVariant is a wrapper class of OpDesc and OperatorBase +// So that API would be the same. +class OpVariant { + struct InputsVisitor + : public boost::static_visitor { + template + const framework::VariableNameMap *operator()(const OpType *op) const { + return &(op->Inputs()); + } + }; + + struct OutputsVisitor + : public boost::static_visitor { + template + const framework::VariableNameMap *operator()(const OpType *op) const { + return &(op->Outputs()); + } + }; + + struct AttributeMapVisitor + : public boost::static_visitor { + const framework::AttributeMap *operator()( + const framework::OpDesc *op) const { + return &(op->GetAttrMap()); + } + + const framework::AttributeMap *operator()( + const framework::OperatorBase *op) const { + return &(op->Attrs()); + } + }; + + struct RawPointerVisitor : public boost::static_visitor { + template + const void *operator()(const OpType *op) const { + return op; + } + }; + + public: + OpVariant(const framework::OperatorBase *op) : op_(op) {} // NOLINT + + OpVariant(const framework::OpDesc *op) : op_(op) {} // NOLINT + + const framework::VariableNameMap &Inputs() const { + return *boost::apply_visitor(InputsVisitor(), op_); + } + + const framework::VariableNameMap &Outputs() const { + return *boost::apply_visitor(OutputsVisitor(), op_); + } + + const framework::AttributeMap &Attrs() const { + return *boost::apply_visitor(AttributeMapVisitor(), op_); + } + + template + const AttrType &Attr(const std::string &name) const { + auto &attrs = Attrs(); + auto it = attrs.find(name); + PADDLE_ENFORCE(it != attrs.end(), "Cannot find attribute %s", name); + return boost::get(it->second); + } + + bool operator==(const OpVariant &other) const { + return RawPointer() == other.RawPointer(); + } + + const void *RawPointer() const { + return boost::apply_visitor(RawPointerVisitor(), op_); + } + + int which() const { return static_cast(op_.which()); } + + struct Hasher { + size_t operator()(const OpVariant &op) const { + return reinterpret_cast(op.RawPointer()); + } + }; + + private: + const boost::variant + op_; +}; + +static std::string GetDebugString(const std::vector &names) { + if (names.empty()) return ""; + std::string ret = names[0]; + for (size_t i = 1; i < names.size(); ++i) { + ret += (" " + names[i]); + } + return ret; +} + +// Set skip variables of while_op and while_grad_op +// These variables should be skipped when eager deletion enables. +// It is because: +// 1. while_grad_op needs some variables defined in while_op. +// 2. while_grad_op needs variables from the previous time step. +static void SetSkipVars(const OpVariant &op, std::vector attr) { + auto &attrs = const_cast(op.Attrs()); + VLOG(2) << "Prepare to skip " << attr.size() + << " var(s): " << GetDebugString(attr); + attrs[kSkipEagerDeletionVars] = std::move(attr); +} + +// Check whether the forward while_op and while_grad_op match +// The program may have many while_ops. +static bool IsMatchedWhileOpAndWhileGradOp(const OpVariant &fwd_op, + const OpVariant &grad_op) { + return fwd_op.Inputs().at(kX) == grad_op.Inputs().at(kX) && + fwd_op.Outputs().at(kOutputs) == grad_op.Inputs().at(kOutputs); +} + +// Test whether the variable is skippable in forward while_op +// The variable is skippable in while_op when the variable used in while_grad +// is not from grad_block. +static bool IsSkippableVar(const std::string &name, + framework::BlockDesc *grad_block) { + return name != framework::kEmptyVarName && !grad_block->HasVar(name); +} + +static void ModifyWhileOpAndWhileGradOpAttr(const OpVariant &fwd_op, + const OpVariant &bwd_op) { + auto *grad_block = bwd_op.Attr(kStepBlock); + + // Find all skippable variables in forward while_op + std::unordered_set forward_skip_vars; + for (auto *op_desc : grad_block->AllOps()) { + for (auto &in_arg_name : op_desc->InputArgumentNames()) { + if (IsSkippableVar(in_arg_name, grad_block)) { + forward_skip_vars.insert(in_arg_name); + } + } + + for (auto &out_arg_name : op_desc->OutputArgumentNames()) { + if (IsSkippableVar(out_arg_name, grad_block)) { + forward_skip_vars.insert(out_arg_name); + } + } + } + + SetSkipVars(fwd_op, std::vector(forward_skip_vars.begin(), + forward_skip_vars.end())); + + // Find all skippable variables in while_grad_op + // The skipped variables are those which would be used across time steps. + auto &fwd_input = fwd_op.Inputs().at(kX); + auto &in_grads = bwd_op.Outputs().at(framework::GradVarName(kX)); + PADDLE_ENFORCE_EQ( + fwd_input.size(), in_grads.size(), + "Backward input gradient number does not match forward input number."); + + std::unordered_set backward_skip_vars; + for (size_t i = 0; i < in_grads.size(); ++i) { + if (in_grads[i] == framework::kEmptyVarName) { + continue; + } + backward_skip_vars.insert(in_grads[i]); + backward_skip_vars.insert(framework::GradVarName(fwd_input[i])); + } + + SetSkipVars(bwd_op, std::vector(backward_skip_vars.begin(), + backward_skip_vars.end())); +} + +// Find all while_ops and while_grad_ops in the graph or program +// The while_grad_op and while_op may located in different blocks +// So we should traverse all blocks in the program and find them out. +static void FindAllWhileAndWhileGradOp(std::vector *while_ops, + std::vector *while_grad_ops) { + PADDLE_ENFORCE_GE(while_ops->size(), while_grad_ops->size()); + + if (while_ops->empty()) return; + + const auto *program = + while_ops->front().Attr(kStepBlock)->Program(); + for (size_t i = 1; i < program->Size(); ++i) { + auto &block = program->Block(i); + for (size_t j = 0; j < block.OpSize(); ++j) { + auto *op = block.Op(j); + if (op->Type() == "while") { + while_ops->emplace_back(op); + } else if (op->Type() == "while_grad") { + while_grad_ops->emplace_back(op); + } + } + } + + PADDLE_ENFORCE_GE(while_ops->size(), while_grad_ops->size(), + "There are extra while_grad ops in the graph or program"); +} + +static void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl( + std::vector *while_ops, std::vector *while_grad_ops) { + FindAllWhileAndWhileGradOp(while_ops, while_grad_ops); + + VLOG(2) << "Found while op num: " << while_ops->size() + << ", while grad op num: " << while_grad_ops->size(); + + if (while_grad_ops->empty()) { + return; + } + + std::unordered_set while_op_set( + while_ops->begin(), while_ops->end()); + + for (auto &bwd_op : *while_grad_ops) { + const OpVariant *matched_fwd_op = nullptr; + for (auto &fwd_op : while_op_set) { + if (IsMatchedWhileOpAndWhileGradOp(fwd_op, bwd_op)) { + PADDLE_ENFORCE(matched_fwd_op == nullptr, + "Found multiple matched while ops"); + matched_fwd_op = &fwd_op; + } + } + PADDLE_ENFORCE_NOT_NULL(matched_fwd_op, + "Cannot find matched forward while op."); + ModifyWhileOpAndWhileGradOpAttr(*matched_fwd_op, bwd_op); + while_op_set.erase(*matched_fwd_op); + } +} + +void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( + int block_id, + const std::vector> &all_ops) { + // If block_id is not 0, returns + // This is because all while_ops and while_grad_ops in the whole program + // would be processed when block_id is 0 (i.e. when Executor::Run() or + // ParallelExecutor constructs). + + // What's more, all while_ops and while_grad_ops must be processed when + // block_id is zero. If not, while_op may run first and erase variables + // used in while_grad_op, and in this moment, while_grad_ops may be not + // constructed yet. + if (block_id != 0) return; + + std::vector fwd_ops, bwd_ops; + for (auto &op : all_ops) { + if (op->Type() == "while") { + fwd_ops.emplace_back(op.get()); + } else if (op->Type() == "while_grad") { + bwd_ops.emplace_back(op.get()); + } + } + PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl(&fwd_ops, &bwd_ops); +} + +void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( + const std::vector &while_ops, + const std::vector &while_grad_ops) { + std::vector fwd_ops, bwd_ops; + fwd_ops.reserve(while_ops.size()); + for (auto *op : while_ops) { + fwd_ops.emplace_back(op); + } + + bwd_ops.reserve(while_grad_ops.size()); + for (auto *op : while_grad_ops) { + bwd_ops.emplace_back(op); + } + + PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl(&fwd_ops, &bwd_ops); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/controlflow/while_op_helper.h b/paddle/fluid/operators/controlflow/while_op_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..456ba8642b9bd32a1236d112cc8b387ae6a279d3 --- /dev/null +++ b/paddle/fluid/operators/controlflow/while_op_helper.h @@ -0,0 +1,43 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/variant.h" + +namespace paddle { +namespace operators { + +static constexpr char kStepBlock[] = "sub_block"; +static constexpr char kCondition[] = "Condition"; +static constexpr char kStepScopes[] = "StepScopes"; +static constexpr char kX[] = "X"; +static constexpr char kXGRAD[] = "X@GRAD"; +static constexpr char kOutputs[] = "Out"; +static constexpr char kSkipEagerDeletionVars[] = "skip_eager_deletion_vars"; + +void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( + int block_id, + const std::vector> &all_ops); + +void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( + const std::vector &while_ops, + const std::vector &while_grad_ops); + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc index f5208e7a601f4dd33b486e5840178022f66431e5..9e5ccd928e9d6012c1da3baa17521dcac0c8ff2f 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -42,6 +42,7 @@ using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; using DataLayout = platform::DataLayout; template using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; +using framework::AlgorithmsCache; template class CUDNNConvOpKernel : public framework::OpKernel { @@ -169,18 +170,8 @@ class CUDNNConvOpKernel : public framework::OpKernel { workspace_size_limit, &algo)); VLOG(3) << "cuDNN forward algo " << algo; } else if (exhaustive_search && (!half_float)) { - AlgorithmsCache* algo_cache = nullptr; - if (ctx.scope().FindVar(kCUDNNFwdAlgoCache)) { - algo_cache = - ctx.scope() - .FindVar(kCUDNNFwdAlgoCache) - ->GetMutable>(); - } else { - algo_cache = - const_cast(ctx.scope()) - .Var(kCUDNNFwdAlgoCache) - ->GetMutable>(); - } + AlgorithmsCache& algo_cache = + ctx.GetKernelConfig>(0); cudnn_workspace = ctx.AllocateTmpTensor( framework::make_ddim( @@ -188,7 +179,7 @@ class CUDNNConvOpKernel : public framework::OpKernel { dev_ctx); cudnn_workspace_ptr = static_cast(cudnn_workspace.data()); - algo = algo_cache->GetAlgorithm( + algo = algo_cache.GetAlgorithm( x_dims, f_dims, strides, paddings, dilations, 0, [&]() { int returned_algo_count; std::array @@ -382,22 +373,11 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { if (input_grad) { T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); if (exhaustive_search) { - AlgorithmsCache* data_algo_cache; - if (ctx.scope().FindVar(kCUDNNBwdDataAlgoCache)) { - data_algo_cache = - ctx.scope() - .FindVar(kCUDNNBwdDataAlgoCache) - ->GetMutable< - AlgorithmsCache>(); - } else { - data_algo_cache = - const_cast(ctx.scope()) - .Var(kCUDNNBwdDataAlgoCache) - ->GetMutable< - AlgorithmsCache>(); - } - - data_algo = data_algo_cache->GetAlgorithm( + AlgorithmsCache& data_algo_cache = + ctx.GetKernelConfig>( + 0); + + data_algo = data_algo_cache.GetAlgorithm( x_dims, f_dims, strides, paddings, dilations, 0, [&]() { int returned_algo_count; std::array { if (filter_grad) { T* filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); if (exhaustive_search) { - AlgorithmsCache* f_algo_cache; - if (ctx.scope().FindVar(kCUDNNBwdFilterAlgoCache)) { - f_algo_cache = - ctx.scope() - .FindVar(kCUDNNBwdFilterAlgoCache) - ->GetMutable< - AlgorithmsCache>(); - } else { - f_algo_cache = - const_cast(ctx.scope()) - .Var(kCUDNNBwdFilterAlgoCache) - ->GetMutable< - AlgorithmsCache>(); - } - - filter_algo = f_algo_cache->GetAlgorithm( + AlgorithmsCache& f_algo_cache = + ctx.GetKernelConfig< + AlgorithmsCache>(1); + + filter_algo = f_algo_cache.GetAlgorithm( x_dims, f_dims, strides, paddings, dilations, 0, [&]() { int returned_algo_count; std::array #include #include +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/cudnn_helper.h" DECLARE_uint64(conv_workspace_size_limit); @@ -46,100 +47,5 @@ static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = 4; static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5; #endif -template -class AlgorithmsCache { - public: - AlgorithmsCache() : search_times_(0) { hash_.clear(); } - // Caches the best algorithm for a given - // combination of tensor dimensions & compute data type. - TAlgorithm GetAlgorithm( - const std::vector& dims1, const std::vector& dims2, - const std::vector& strides, const std::vector& paddings, - const std::vector& dilations, - int algorithmFlags, // can set for different data type - std::function gen_func); - - TAlgorithm GetAlgorithm(int64_t area, int search_times, int algorithmFlags, - std::function gen_func); - - private: - std::unordered_map hash_; - std::mutex mutex_; - - int search_times_; -}; - -template -TAlgorithm AlgorithmsCache::GetAlgorithm( - const std::vector& dims1, const std::vector& dims2, - const std::vector& strides, const std::vector& paddings, - const std::vector& dilations, int algorithmFlags, - std::function gen_func) { - std::lock_guard lock(mutex_); - int64_t seed = 0; - // Hash all of the inputs, use to try and look up a previously - // discovered algorithm, or fall back to generating a new one. - std::hash hashFn; - // do hash like boost - // https://stackoverflow.com/questions/2590677/how-do-i-combine-hash-values-in-c0x - for (const auto num : dims1) { - seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2); - } - - for (const auto num : dims2) { - seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2) + 1; - } - - for (const auto num : strides) { - seed ^= hashFn(static_cast(num)) + 0x9e3779b9 + (seed << 6) + - (seed >> 2) + 2; - } - - for (const auto num : paddings) { - seed ^= hashFn(static_cast(num)) + 0x9e3779b9 + (seed << 6) + - (seed >> 2) + 3; - } - - for (const auto num : dilations) { - seed ^= hashFn(static_cast(num)) + 0x9e3779b9 + (seed << 6) + - (seed >> 2) + 4; - } - - seed ^= hashFn(static_cast(algorithmFlags)) + 0x9e3779b9 + - (seed << 6) + (seed >> 2) + 5; - - if (seed == 0) return gen_func(); - - if (hash_.find(seed) == hash_.end()) { - TAlgorithm value = gen_func(); - hash_[seed] = value; - } - return hash_[seed]; -} - -template -TAlgorithm AlgorithmsCache::GetAlgorithm( - int64_t area, int search_times, int algorithmFlags, - std::function gen_func) { - if (hash_.find(area) != hash_.end()) { - return hash_[area]; - } - if (search_times_ < search_times) { - auto algo = gen_func(); - hash_[area] = algo; - ++search_times_; - return algo; - } - TAlgorithm algo; - int64_t min = static_cast(INT_MAX); - for (const auto& m : hash_) { - if (m.first < min) { - min = m.first; - algo = m.second; - } - } - return algo; -} - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/conv_fusion_op.cu.cc b/paddle/fluid/operators/conv_fusion_op.cu.cc index d8b997cca613f660046106512fc03bf55f9b992d..64152829b4f000e545054e528edca33dfe96ec56 100644 --- a/paddle/fluid/operators/conv_fusion_op.cu.cc +++ b/paddle/fluid/operators/conv_fusion_op.cu.cc @@ -30,6 +30,8 @@ using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; using ScopedActivationDescriptor = platform::ScopedActivationDescriptor; using DataLayout = platform::DataLayout; +using framework::AlgorithmsCache; + template using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; @@ -139,38 +141,21 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { } return fwd_perf_stat[0].algo; }; - AlgorithmsCache* algo_cache = nullptr; + AlgorithmsCache& algo_cache = + ctx.GetKernelConfig>(0); int search_times = ctx.Attr("search_times"); search_times = std::max( static_cast(FLAGS_cudnn_exhaustive_search_times), search_times); + // TODO(dangqingqing): Unify this if-else. if (search_times > 0) { // The searched algo will be cached by `search_times` times for // different input dimension. For other dimensions, select the algo // of closest area. - auto var_name = ctx.Inputs("AlgoCache")[0]; - algo_cache = - ctx.scope() - .FindVar(var_name) - ->GetMutable>(); - algo = algo_cache->GetAlgorithm(x_dims[2] * x_dims[3], search_times, 0, - search_func); + algo = algo_cache.GetAlgorithm(x_dims[2] * x_dims[3], search_times, 0, + search_func); } else { - // Cache searched algo in Var(kCUDNNFwdAlgoCache). - // all conv ops use the same kCUDNNFwdAlgoCache variable. - if (ctx.scope().FindVar(kCUDNNFwdAlgoCache)) { - algo_cache = - ctx.scope() - .FindVar(kCUDNNFwdAlgoCache) - ->GetMutable>(); - } else { - // TODO(qingqing) remove const_cast - algo_cache = - const_cast(ctx.scope().parent()) - ->Var(kCUDNNFwdAlgoCache) - ->GetMutable>(); - } - algo = algo_cache->GetAlgorithm(x_dims, f_dims, strides, paddings, - dilations, 0, search_func); + algo = algo_cache.GetAlgorithm(x_dims, f_dims, strides, paddings, + dilations, 0, search_func); } VLOG(3) << "choose algo " << algo; } diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index fd9f156d070bdb1990a2fc9c63305933050e5524..ca6bc4df0fe2c6cddaf548d3e708e777172a0841 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include #ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/platform/cudnn_helper.h" #endif #ifdef PADDLE_WITH_MKLDNN @@ -80,6 +81,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( framework::OpKernelType::kDefaultCustomizedTypeValue; framework::LibraryType library{framework::LibraryType::kPlain}; // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + auto input_data_type = ctx.Input("Input")->type(); std::string data_format = ctx.Attr("data_format"); framework::DataLayout layout = framework::StringToDataLayout(data_format); @@ -93,11 +95,14 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( platform::CanMKLDNNBeUsed(ctx)) { library = framework::LibraryType::kMKLDNN; layout = framework::DataLayout::kMKLDNN; - customized_type_value = kConvMKLDNNFP32; + customized_type_value = + (input_data_type == framework::DataTypeTrait::DataType || + input_data_type == framework::DataTypeTrait::DataType) + ? kConvMKLDNNINT8 + : kConvMKLDNNFP32; } #endif - auto input_data_type = ctx.Input("Input")->type(); if (input_data_type != framework::proto::VarType::INT8 && input_data_type != framework::proto::VarType::UINT8) { auto filter_data_type = ctx.Input("Filter")->type(); @@ -109,8 +114,20 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( "float16 can only be used when CUDNN is used"); } - return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, - library, customized_type_value); + auto type = framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, + library, customized_type_value); +#ifdef PADDLE_WITH_CUDA + std::vector& configs = kernel_configs_map_[type]; + // TODO(dangqingqing): Currently conv_fusion_op use cudnn but sets use_cudnn + // to false. It should be fixed and then here should only create if library + // is kCUDNN. + if (configs.empty()) { + std::shared_ptr> p( + new framework::AlgorithmsCache()); + configs.push_back(p); + } +#endif + return type; } void Conv2DOpMaker::Make() { @@ -410,9 +427,25 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType( } #endif - return framework::OpKernelType(ctx.Input("Input")->type(), - ctx.GetPlace(), layout_, library_, - customized_type_value); + auto type = framework::OpKernelType(ctx.Input("Input")->type(), + ctx.GetPlace(), layout_, library_, + customized_type_value); +#ifdef PADDLE_WITH_CUDA + if (library_ == framework::LibraryType::kCUDNN) { + std::vector& configs = kernel_configs_map_[type]; + if (configs.empty()) { + std::shared_ptr> + p(new framework::AlgorithmsCache()); + configs.push_back(p); + + std::shared_ptr< + framework::AlgorithmsCache> + p2(new framework::AlgorithmsCache()); + configs.push_back(p2); + } + } +#endif + return type; } class Conv2dGradMaker : public framework::SingleGradOpDescMaker { diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index 86a140f15219001126283aa8b3f76d72fddb28fc..c994c6f642d286d9b52ada667058b064ff242ce6 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -127,6 +127,12 @@ void Conv2DTransposeOpMaker::Make() { "output feature channels," "H is the height of the filter, and W is the width of the filter. " "We enforce groups number == 1 in the convolution transpose scenario."); + AddInput("Bias", + "(Tensor) Bias to be added to each output of filter application." + "The format of output tensor is X (one-dimensional) of size equal" + "to the number of output channels. Only used with MKL-DNN.") + .AsDispensable(); + AddOutput("Output", "(Tensor) The output tensor of convolution transpose operator. " "The format of output tensor is also NCHW."); diff --git a/paddle/fluid/operators/crf_decoding_op.cc b/paddle/fluid/operators/crf_decoding_op.cc index 81c9e9e543191d9b2d606217d726cc783be97fea..e053ae57739d3d96209e9ca180cc041f8b55396e 100644 --- a/paddle/fluid/operators/crf_decoding_op.cc +++ b/paddle/fluid/operators/crf_decoding_op.cc @@ -84,12 +84,12 @@ class CRFDecodingOp : public framework::OperatorWithKernel { "Output(ViterbiPath) should be not null."); auto emission_dims = ctx->GetInputDim("Emission"); - PADDLE_ENFORCE_EQ(emission_dims.size(), 2UL, + PADDLE_ENFORCE_EQ(emission_dims.size(), 2, "The Input(Emission) should be a 2-D tensor."); PADDLE_ENFORCE(emission_dims[0], "An empty mini-batch is not allowed."); auto transition_dims = ctx->GetInputDim("Transition"); - PADDLE_ENFORCE_EQ(transition_dims.size(), 2UL, + PADDLE_ENFORCE_EQ(transition_dims.size(), 2, "The Input(Transition) should be a 2-D tensor."); PADDLE_ENFORCE_EQ( transition_dims[0] - 2, transition_dims[1], diff --git a/paddle/fluid/operators/crf_decoding_op.h b/paddle/fluid/operators/crf_decoding_op.h index 72774a878d98b431da05cf870139752421b2df8d..d6b54038ec5648c72d606a6c7b9c8356cb74521b 100644 --- a/paddle/fluid/operators/crf_decoding_op.h +++ b/paddle/fluid/operators/crf_decoding_op.h @@ -82,8 +82,9 @@ class CRFDecodingOpKernel : public framework::OpKernel { Tensor track; int* track_value = track.mutable_data(emission_dims, platform::CPUPlace()); - auto ker = jit::Get, - platform::CPUPlace>(tag_num); + auto ker = + jit::KernelFuncs, platform::CPUPlace>::Cache() + .At(tag_num); ker(static_cast(seq_len), x, w, alpha_value, track_value, tag_num); T max_score = -std::numeric_limits::max(); int max_i = 0; diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc index 1968e54b00601139e252f0480ca3ae1fc08904f4..3adc7baebddd06ced74afea1e77017beb57582e8 100644 --- a/paddle/fluid/operators/cross_entropy_op.cc +++ b/paddle/fluid/operators/cross_entropy_op.cc @@ -32,14 +32,23 @@ class CrossEntropyOp : public framework::OperatorWithKernel { int rank = x_dims.size(); PADDLE_ENFORCE_EQ(rank, label_dims.size(), "Input(X) and Input(Label) shall have the same rank."); - PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), - framework::slice_ddim(label_dims, 0, rank - 1), - "Input(X) and Input(Label) shall have the same shape " - "except the last dimension."); + bool check = true; + if ((!ctx->IsRuntime()) && (framework::product(x_dims) <= 0 || + framework::product(label_dims) <= 0)) { + check = false; + } + if (check) { + PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), + framework::slice_ddim(label_dims, 0, rank - 1), + "Input(X) and Input(Label) shall have the same shape " + "except the last dimension."); + } if (ctx->Attrs().Get("soft_label")) { - PADDLE_ENFORCE_EQ(x_dims[rank - 1], label_dims[rank - 1], - "If Attr(soft_label) == true, the last dimension of " - "Input(X) and Input(Label) should be equal."); + if (check) { + PADDLE_ENFORCE_EQ(x_dims[rank - 1], label_dims[rank - 1], + "If Attr(soft_label) == true, the last dimension of " + "Input(X) and Input(Label) should be equal."); + } } else { PADDLE_ENFORCE_EQ(label_dims[rank - 1], 1UL, "If Attr(softLabel) == false, the last dimension of " @@ -82,20 +91,32 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel { "Input(Y@Grad) and Input(X) should have the same rank."); PADDLE_ENFORCE_EQ(label_dims.size(), rank, "Input(Label) and Input(X) should have the same rank."); - PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), - framework::slice_ddim(label_dims, 0, rank - 1), - "The Input(X) and Input(Label) should have the same " - "shape except the last dimension."); - PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), - framework::slice_ddim(dy_dims, 0, rank - 1), - "The Input(X) and Input(Y@Grad) should have the same " - "shape except the last dimension."); + + bool check = true; + if ((!ctx->IsRuntime()) && (framework::product(x_dims) <= 0 || + framework::product(label_dims) <= 0)) { + check = false; + } + + if (check) { + PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), + framework::slice_ddim(label_dims, 0, rank - 1), + "The Input(X) and Input(Label) should have the same " + "shape except the last dimension."); + PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), + framework::slice_ddim(dy_dims, 0, rank - 1), + "The Input(X) and Input(Y@Grad) should have the same " + "shape except the last dimension."); + } PADDLE_ENFORCE_EQ(dy_dims[rank - 1], 1, "The last dimension of Input(Y@Grad) should be 1."); if (ctx->Attrs().Get("soft_label")) { - PADDLE_ENFORCE_EQ(x_dims[rank - 1], label_dims[rank - 1], - "When Attr(soft_label) == true, the last dimension of " - "Input(X) and Input(Label) should be equal."); + if (check) { + PADDLE_ENFORCE_EQ( + x_dims[rank - 1], label_dims[rank - 1], + "When Attr(soft_label) == true, the last dimension of " + "Input(X) and Input(Label) should be equal."); + } } else { PADDLE_ENFORCE_EQ(label_dims[rank - 1], 1, "When Attr(soft_label) == false, the last dimension of " diff --git a/paddle/fluid/operators/data_norm_op.cc b/paddle/fluid/operators/data_norm_op.cc index d5bc25d19cba4de6f059612e3e8c4a65b2edd0f9..45bce6e5203f8c1dbb744e0f954f7f0a71c53372 100644 --- a/paddle/fluid/operators/data_norm_op.cc +++ b/paddle/fluid/operators/data_norm_op.cc @@ -140,9 +140,6 @@ class DataNormOpMaker : public framework::OpProtoAndCheckerMaker { "Scales of the history data batch, " "will apply to output when training") .AsIntermediate(); - AddAttr("use_mkldnn", - "(bool, default false) Only used in mkldnn kernel") - .SetDefault(false); AddComment(R"DOC( Data Normalization. diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index f6fbe97565c43c306ea885c765c0a665492fa317..c87837e69424335ac926bf05664e5f79940390b5 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -33,11 +33,14 @@ detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc) detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc) detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu) detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc) +detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc box_decoder_and_assign_op.cu) if(WITH_GPU) detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS memory cub) + detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc distribute_fpn_proposals_op.cu DEPS memory cub) else() detection_library(generate_proposals_op SRCS generate_proposals_op.cc) + detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc) endif() detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op.cc roi_perspective_transform_op.cu) diff --git a/paddle/fluid/operators/detection/anchor_generator_op.cc b/paddle/fluid/operators/detection/anchor_generator_op.cc index f2984d1af2f26d901bc30ecfd519d5268a60278a..4a333b559f82e6d39d2d4345c8ad58bc8d430c69 100644 --- a/paddle/fluid/operators/detection/anchor_generator_op.cc +++ b/paddle/fluid/operators/detection/anchor_generator_op.cc @@ -85,7 +85,7 @@ class AnchorGeneratorOpMaker : public framework::OpProtoAndCheckerMaker { " For instance, the anchor size of 64 means the area of this anchor " "equals to 64**2.") .AddCustomChecker([](const std::vector& anchor_sizes) { - PADDLE_ENFORCE_GT(anchor_sizes.size(), 0, + PADDLE_ENFORCE_GT(anchor_sizes.size(), 0UL, "Size of anchor_sizes must be at least 1."); for (size_t i = 0; i < anchor_sizes.size(); ++i) { PADDLE_ENFORCE_GT(anchor_sizes[i], 0.0, @@ -103,7 +103,7 @@ class AnchorGeneratorOpMaker : public framework::OpProtoAndCheckerMaker { "(vector) List of variances to be used " "in box regression deltas") .AddCustomChecker([](const std::vector& variances) { - PADDLE_ENFORCE_EQ(variances.size(), 4, + PADDLE_ENFORCE_EQ(variances.size(), 4UL, "Must and only provide 4 variance."); for (size_t i = 0; i < variances.size(); ++i) { PADDLE_ENFORCE_GT(variances[i], 0.0, @@ -117,7 +117,7 @@ class AnchorGeneratorOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(std::vector(2, 16.0)) .AddCustomChecker([](const std::vector& stride) { PADDLE_ENFORCE_EQ( - stride.size(), 2, + stride.size(), 2UL, "Must and only provide 2 stride for width and height."); for (size_t i = 0; i < stride.size(); ++i) { PADDLE_ENFORCE_GT(stride[i], 0.0, diff --git a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..945d575a6446429a0ec34a603356c2c99263a776 --- /dev/null +++ b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc @@ -0,0 +1,169 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/box_decoder_and_assign_op.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; + +class BoxDecoderAndAssignOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE( + ctx->HasInput("PriorBox"), + "Input(PriorBox) of BoxDecoderAndAssignOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("PriorBoxVar"), + "Input(PriorBoxVar) of BoxDecoderAndAssignOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("TargetBox"), + "Input(TargetBox) of BoxDecoderAndAssignOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("BoxScore"), + "Input(BoxScore) of BoxDecoderAndAssignOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("DecodeBox"), + "Output(DecodeBox) of BoxDecoderAndAssignOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("OutputAssignBox"), + "Output(OutputAssignBox) of BoxDecoderAndAssignOp should not be null."); + + auto prior_box_dims = ctx->GetInputDim("PriorBox"); + auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar"); + auto target_box_dims = ctx->GetInputDim("TargetBox"); + auto box_score_dims = ctx->GetInputDim("BoxScore"); + + PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2, + "The rank of Input of PriorBox must be 2"); + PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [N, 4]"); + PADDLE_ENFORCE_EQ(prior_box_var_dims.size(), 1, + "The rank of Input of PriorBoxVar must be 1"); + PADDLE_ENFORCE_EQ(prior_box_var_dims[0], 4, + "The shape of PriorBoxVar is [4]"); + PADDLE_ENFORCE_EQ(target_box_dims.size(), 2, + "The rank of Input of TargetBox must be 2"); + PADDLE_ENFORCE_EQ(box_score_dims.size(), 2, + "The rank of Input of BoxScore must be 2"); + PADDLE_ENFORCE_EQ(prior_box_dims[0], target_box_dims[0], + "The first dim of prior_box and target_box is roi nums " + "and should be same!"); + PADDLE_ENFORCE_EQ(prior_box_dims[0], box_score_dims[0], + "The first dim of prior_box and box_score is roi nums " + "and should be same!"); + PADDLE_ENFORCE_EQ(target_box_dims[1], box_score_dims[1] * prior_box_dims[1], + "The shape of target_box is [N, classnum * 4], The shape " + "of box_score is [N, classnum], The shape of prior_box " + "is [N, 4]"); + + ctx->SetOutputDim("DecodeBox", framework::make_ddim({target_box_dims[0], + target_box_dims[1]})); + ctx->ShareLoD("TargetBox", /*->*/ "DecodeBox"); + ctx->SetOutputDim( + "OutputAssignBox", + framework::make_ddim({prior_box_dims[0], prior_box_dims[1]})); + ctx->ShareLoD("PriorBox", /*->*/ "OutputAssignBox"); + } +}; + +class BoxDecoderAndAssignOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput( + "PriorBox", + "(Tensor, default Tensor) " + "Box list PriorBox is a 2-D Tensor with shape [N, 4] which holds N " + "boxes and each box is represented as [xmin, ymin, xmax, ymax], " + "[xmin, ymin] is the left top coordinate of the anchor box, " + "if the input is image feature map, they are close to the origin " + "of the coordinate system. [xmax, ymax] is the right bottom " + "coordinate of the anchor box."); + AddInput("PriorBoxVar", + "(Tensor, default Tensor, optional) " + "PriorBoxVar is a 2-D Tensor with shape [N, 4] which holds N " + "group of variance. PriorBoxVar will set all elements to 1 by " + "default.") + .AsDispensable(); + AddInput("TargetBox", + "(LoDTensor or Tensor) " + "This input can be a 2-D LoDTensor with shape " + "[N, classnum*4]. It holds N targets for N boxes."); + AddInput("BoxScore", + "(LoDTensor or Tensor) " + "This input can be a 2-D LoDTensor with shape " + "[N, classnum], each box is represented as [classnum] which is " + "the classification probabilities."); + AddAttr("box_clip", + "(float, default 4.135, np.log(1000. / 16.)) " + "clip box to prevent overflowing") + .SetDefault(4.135f); + AddOutput("DecodeBox", + "(LoDTensor or Tensor) " + "the output tensor of op with shape [N, classnum * 4] " + "representing the result of N target boxes decoded with " + "M Prior boxes and variances for each class."); + AddOutput("OutputAssignBox", + "(LoDTensor or Tensor) " + "the output tensor of op with shape [N, 4] " + "representing the result of N target boxes decoded with " + "M Prior boxes and variances with the best non-background class " + "by BoxScore."); + AddComment(R"DOC( + +Bounding Box Coder. + +Decode the target bounding box with the prior_box information. + +The Decoding schema is described below: + + $$ + ox = (pw \\times pxv \\times tx + px) - \\frac{tw}{2} + $$ + $$ + oy = (ph \\times pyv \\times ty + py) - \\frac{th}{2} + $$ + $$ + ow = \\exp (pwv \\times tw) \\times pw + \\frac{tw}{2} + $$ + $$ + oh = \\exp (phv \\times th) \\times ph + \\frac{th}{2} + $$ + +where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, width +and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote the +prior_box's (anchor) center coordinates, width and height. `pxv`, `pyv`, `pwv`, +`phv` denote the variance of the prior_box and `ox`, `oy`, `ow`, `oh` denote the +decoded coordinates, width and height in decode_box. + +decode_box is obtained after box decode, then assigning schema is described below: + +For each prior_box, use the best non-background class's decoded values to +update the prior_box locations and get output_assign_box. So, the shape of +output_assign_box is the same as PriorBox. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(box_decoder_and_assign, ops::BoxDecoderAndAssignOp, + ops::BoxDecoderAndAssignOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL( + box_decoder_and_assign, + ops::BoxDecoderAndAssignKernel, + ops::BoxDecoderAndAssignKernel); diff --git a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..25e6545eb59bde5e080dc907f9ecd4281062413f --- /dev/null +++ b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu @@ -0,0 +1,147 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/operators/detection/box_decoder_and_assign_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" + +namespace paddle { +namespace operators { + +template +__global__ void DecodeBoxKernel(const T* prior_box_data, + const T* prior_box_var_data, + const T* target_box_data, const int roi_num, + const int class_num, const T box_clip, + T* output_box_data) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < roi_num * class_num) { + int i = idx / class_num; + int j = idx % class_num; + T prior_box_width = prior_box_data[i * 4 + 2] - prior_box_data[i * 4] + 1; + T prior_box_height = + prior_box_data[i * 4 + 3] - prior_box_data[i * 4 + 1] + 1; + T prior_box_center_x = prior_box_data[i * 4] + prior_box_width / 2; + T prior_box_center_y = prior_box_data[i * 4 + 1] + prior_box_height / 2; + + int offset = i * class_num * 4 + j * 4; + T dw = prior_box_var_data[2] * target_box_data[offset + 2]; + T dh = prior_box_var_data[3] * target_box_data[offset + 3]; + if (dw > box_clip) { + dw = box_clip; + } + if (dh > box_clip) { + dh = box_clip; + } + T target_box_center_x = 0, target_box_center_y = 0; + T target_box_width = 0, target_box_height = 0; + target_box_center_x = + prior_box_var_data[0] * target_box_data[offset] * prior_box_width + + prior_box_center_x; + target_box_center_y = + prior_box_var_data[1] * target_box_data[offset + 1] * prior_box_height + + prior_box_center_y; + target_box_width = expf(dw) * prior_box_width; + target_box_height = expf(dh) * prior_box_height; + + output_box_data[offset] = target_box_center_x - target_box_width / 2; + output_box_data[offset + 1] = target_box_center_y - target_box_height / 2; + output_box_data[offset + 2] = + target_box_center_x + target_box_width / 2 - 1; + output_box_data[offset + 3] = + target_box_center_y + target_box_height / 2 - 1; + } +} + +template +__global__ void AssignBoxKernel(const T* prior_box_data, + const T* box_score_data, T* output_box_data, + const int roi_num, const int class_num, + T* output_assign_box_data) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < roi_num) { + int i = idx; + T max_score = -1; + int max_j = -1; + for (int j = 0; j < class_num; ++j) { + T score = box_score_data[i * class_num + j]; + if (score > max_score && j > 0) { + max_score = score; + max_j = j; + } + } + if (max_j > 0) { + for (int pno = 0; pno < 4; pno++) { + output_assign_box_data[i * 4 + pno] = + output_box_data[i * class_num * 4 + max_j * 4 + pno]; + } + } else { + for (int pno = 0; pno < 4; pno++) { + output_assign_box_data[i * 4 + pno] = prior_box_data[i * 4 + pno]; + } + } + } +} + +template +class BoxDecoderAndAssignCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()), + "This kernel only runs on GPU device."); + auto* prior_box = context.Input("PriorBox"); + auto* prior_box_var = context.Input("PriorBoxVar"); + auto* target_box = context.Input("TargetBox"); + auto* box_score = context.Input("BoxScore"); + auto* output_box = context.Output("DecodeBox"); + auto* output_assign_box = + context.Output("OutputAssignBox"); + + auto roi_num = target_box->dims()[0]; + auto class_num = box_score->dims()[1]; + auto* target_box_data = target_box->data(); + auto* prior_box_data = prior_box->data(); + auto* prior_box_var_data = prior_box_var->data(); + auto* box_score_data = box_score->data(); + output_box->mutable_data({roi_num, class_num * 4}, context.GetPlace()); + output_assign_box->mutable_data({roi_num, 4}, context.GetPlace()); + T* output_box_data = output_box->data(); + T* output_assign_box_data = output_assign_box->data(); + + int block = 512; + int grid = (roi_num * class_num + block - 1) / block; + auto& device_ctx = context.cuda_device_context(); + + const T box_clip = context.Attr("box_clip"); + + DecodeBoxKernel<<>>( + prior_box_data, prior_box_var_data, target_box_data, roi_num, class_num, + box_clip, output_box_data); + + context.device_context().Wait(); + int assign_grid = (roi_num + block - 1) / block; + AssignBoxKernel<<>>( + prior_box_data, box_score_data, output_box_data, roi_num, class_num, + output_assign_box_data); + context.device_context().Wait(); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + box_decoder_and_assign, + ops::BoxDecoderAndAssignCUDAKernel, + ops::BoxDecoderAndAssignCUDAKernel); diff --git a/paddle/fluid/operators/detection/box_decoder_and_assign_op.h b/paddle/fluid/operators/detection/box_decoder_and_assign_op.h new file mode 100644 index 0000000000000000000000000000000000000000..e66a8351f4761fc805dbd2e44f237c751642d816 --- /dev/null +++ b/paddle/fluid/operators/detection/box_decoder_and_assign_op.h @@ -0,0 +1,103 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +class BoxDecoderAndAssignKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* prior_box = context.Input("PriorBox"); + auto* prior_box_var = context.Input("PriorBoxVar"); + auto* target_box = context.Input("TargetBox"); + auto* box_score = context.Input("BoxScore"); + auto* output_box = context.Output("DecodeBox"); + auto* output_assign_box = + context.Output("OutputAssignBox"); + int roi_num = target_box->dims()[0]; + int class_num = box_score->dims()[1]; + auto* target_box_data = target_box->data(); + auto* prior_box_data = prior_box->data(); + auto* prior_box_var_data = prior_box_var->data(); + auto* box_score_data = box_score->data(); + output_box->mutable_data({roi_num, class_num * 4}, context.GetPlace()); + output_assign_box->mutable_data({roi_num, 4}, context.GetPlace()); + T* output_box_data = output_box->data(); + T* output_assign_box_data = output_assign_box->data(); + const T bbox_clip = context.Attr("box_clip"); + + for (int i = 0; i < roi_num; ++i) { + T prior_box_width = prior_box_data[i * 4 + 2] - prior_box_data[i * 4] + 1; + T prior_box_height = + prior_box_data[i * 4 + 3] - prior_box_data[i * 4 + 1] + 1; + T prior_box_center_x = prior_box_data[i * 4] + prior_box_width / 2; + T prior_box_center_y = prior_box_data[i * 4 + 1] + prior_box_height / 2; + for (int j = 0; j < class_num; ++j) { + int64_t offset = i * class_num * 4 + j * 4; + T dw = std::min(prior_box_var_data[2] * target_box_data[offset + 2], + bbox_clip); + T dh = std::min(prior_box_var_data[3] * target_box_data[offset + 3], + bbox_clip); + T target_box_center_x = 0, target_box_center_y = 0; + T target_box_width = 0, target_box_height = 0; + target_box_center_x = + prior_box_var_data[0] * target_box_data[offset] * prior_box_width + + prior_box_center_x; + target_box_center_y = prior_box_var_data[1] * + target_box_data[offset + 1] * + prior_box_height + + prior_box_center_y; + target_box_width = std::exp(dw) * prior_box_width; + target_box_height = std::exp(dh) * prior_box_height; + + output_box_data[offset] = target_box_center_x - target_box_width / 2; + output_box_data[offset + 1] = + target_box_center_y - target_box_height / 2; + output_box_data[offset + 2] = + target_box_center_x + target_box_width / 2 - 1; + output_box_data[offset + 3] = + target_box_center_y + target_box_height / 2 - 1; + } + + T max_score = -1; + int max_j = -1; + for (int j = 0; j < class_num; ++j) { + T score = box_score_data[i * class_num + j]; + if (score > max_score && j > 0) { + max_score = score; + max_j = j; + } + } + + if (max_j > 0) { + for (int pno = 0; pno < 4; pno++) { + output_assign_box_data[i * 4 + pno] = + output_box_data[i * class_num * 4 + max_j * 4 + pno]; + } + } else { + for (int pno = 0; pno < 4; pno++) { + output_assign_box_data[i * 4 + pno] = prior_box_data[i * 4 + pno]; + } + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..6d36876efd747d9e6f90c0d0200a9e9610a5318c --- /dev/null +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc @@ -0,0 +1,93 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h" + +namespace paddle { +namespace operators { + +class DistributeFpnProposalsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("FpnRois"), + "Input(FpnRois) shouldn't be null"); + PADDLE_ENFORCE_GE( + ctx->Outputs("MultiFpnRois").size(), 1UL, + "Outputs(MultiFpnRois) of DistributeOp should not be empty"); + size_t min_level = static_cast(ctx->Attrs().Get("min_level")); + size_t max_level = static_cast(ctx->Attrs().Get("max_level")); + PADDLE_ENFORCE_GE(max_level, min_level, + "max_level must not lower than min_level"); + // Set the output shape + size_t num_out_rois = max_level - min_level + 1; + std::vector outs_dims; + outs_dims.reserve(num_out_rois); + for (size_t i = 0; i < num_out_rois; ++i) { + framework::DDim out_dim = {-1, 4}; + outs_dims.push_back(out_dim); + } + ctx->SetOutputsDim("MultiFpnRois", outs_dims); + ctx->SetOutputDim("RestoreIndex", {1, -1}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("FpnRois")); + return framework::OpKernelType(data_type, platform::CPUPlace()); + } +}; + +class DistributeFpnProposalsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("FpnRois", "(LoDTensor) The rois at all levels in shape (-1, 4)"); + AddOutput("MultiFpnRois", "(LoDTensor) Output with distribute operator") + .AsDuplicable(); + AddOutput("RestoreIndex", + "(Tensor) An array of positive number which is " + "used to restore the order of FpnRois"); + AddAttr("min_level", + "The lowest level of FPN layer where the" + " proposals come from"); + AddAttr("max_level", + "The highest level of FPN layer where the" + " proposals come from"); + AddAttr("refer_level", + "The referring level of FPN layer with" + " specified scale"); + AddAttr("refer_scale", + "The referring scale of FPN layer with" + " specified level"); + AddComment(R"DOC( +This operator distribute all proposals into different fpn level, + with respect to scale of the proposals, the referring scale and + the referring level. Besides, to restore the order of proposals, +we return an array which indicate the original index of rois in + current proposals. +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(distribute_fpn_proposals, ops::DistributeFpnProposalsOp, + ops::DistributeFpnProposalsOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(distribute_fpn_proposals, + ops::DistributeFpnProposalsOpKernel, + ops::DistributeFpnProposalsOpKernel); diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..9cbb969158386547485fad54120510595eb92804 --- /dev/null +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu @@ -0,0 +1,221 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "cub/cub.cuh" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h" +#include "paddle/fluid/operators/gather.cu.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; + +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +int const BBoxSize = 4; + +struct RangeInitFunctor { + int start_; + int delta_; + int* out_; + __device__ void operator()(size_t i) { out_[i] = start_ + i * delta_; } +}; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +static inline void TransLoD(const int* length_lod, const int lod_size, + int* offset_lod) { + int offset = 0; + for (int i = 0; i < lod_size; ++i) { + offset_lod[i] = offset; + offset += length_lod[i]; + } +} + +template +static __device__ inline T RoIArea(const T* box, bool normalized) { + if (box[2] < box[0] || box[3] < box[1]) { + // If coordinate values are is invalid + // (e.g. xmax < xmin or ymax < ymin), return 0. + return static_cast(0.); + } else { + const T w = box[2] - box[0]; + const T h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + // If coordinate values are not within range [0, 1]. + return (w + 1) * (h + 1); + } + } +} + +template +static __global__ void GPUDistFpnProposalsHelper( + const int nthreads, const T* rois, const int lod_size, + const int refer_level, const int refer_scale, const int max_level, + const int min_level, int* roi_batch_id_data, int* sub_lod_list, + int* target_lvls) { + CUDA_1D_KERNEL_LOOP(i, nthreads) { + const T* offset_roi = rois + i * BBoxSize; + int roi_batch_ind = roi_batch_id_data[i]; + // get the target level of current rois + T roi_area = RoIArea(offset_roi, false); + T roi_scale = sqrt(roi_area); + int tgt_lvl = floor(log2(roi_scale / refer_scale) + refer_level); + tgt_lvl = min(max_level, max(tgt_lvl, min_level)); + target_lvls[i] = tgt_lvl; + // compute number of rois in the same batch and same target level + platform::CudaAtomicAdd(sub_lod_list + tgt_lvl * lod_size + roi_batch_ind, + 1); + } +} + +template +class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* fpn_rois = ctx.Input("FpnRois"); + + auto multi_fpn_rois = ctx.MultiOutput("MultiFpnRois"); + auto* restore_index = ctx.Output("RestoreIndex"); + + const int min_level = ctx.Attr("min_level"); + const int max_level = ctx.Attr("max_level"); + const int refer_level = ctx.Attr("refer_level"); + const int refer_scale = ctx.Attr("refer_scale"); + int num_level = max_level - min_level + 1; + + // check that the fpn_rois is not empty + PADDLE_ENFORCE_EQ(fpn_rois->lod().size(), 1UL, + "DistributeFpnProposalsOp need 1 level of LoD"); + + auto fpn_rois_lod = fpn_rois->lod().back(); + int lod_size = fpn_rois_lod.size() - 1; + int roi_num = fpn_rois_lod[lod_size]; + + auto& dev_ctx = ctx.template device_context(); + + // get batch id by lod in CPU + Tensor roi_batch_id_list; + roi_batch_id_list.Resize({roi_num}); + int* roi_batch_id_data = + roi_batch_id_list.mutable_data(platform::CPUPlace()); + for (int n = 0; n < lod_size; ++n) { + for (size_t i = fpn_rois_lod[n]; i < fpn_rois_lod[n + 1]; ++i) { + roi_batch_id_data[i] = n; + } + } + // copy batch id list to GPU + Tensor roi_batch_id_list_gpu; + framework::TensorCopySync(roi_batch_id_list, dev_ctx.GetPlace(), + &roi_batch_id_list_gpu); + + Tensor sub_lod_list; + sub_lod_list.Resize({num_level, lod_size}); + int* sub_lod_list_data = sub_lod_list.mutable_data(dev_ctx.GetPlace()); + Tensor target_lvls; + target_lvls.Resize({roi_num}); + int* target_lvls_data = target_lvls.mutable_data(dev_ctx.GetPlace()); + + int blocks = NumBlocks(roi_num); + int threads = kNumCUDAThreads; + + // get target levels and sub_lod list + GPUDistFpnProposalsHelper<<>>( + roi_num, fpn_rois->data(), lod_size, refer_level, refer_scale, + max_level, min_level, roi_batch_id_list_gpu.data(), + sub_lod_list_data, target_lvls_data); + + Tensor index_in_t; + int* idx_in = index_in_t.mutable_data({roi_num}, dev_ctx.GetPlace()); + platform::ForRange for_range(dev_ctx, roi_num); + for_range(RangeInitFunctor{0, 1, idx_in}); + + Tensor keys_out_t; + int* keys_out = keys_out_t.mutable_data({roi_num}, dev_ctx.GetPlace()); + Tensor index_out_t; + int* idx_out = index_out_t.mutable_data({roi_num}, dev_ctx.GetPlace()); + + // Determine temporary device storage requirements + size_t temp_storage_bytes = 0; + cub::DeviceRadixSort::SortPairsDescending( + nullptr, temp_storage_bytes, target_lvls_data, keys_out, idx_in, + idx_out, roi_num); + // Allocate temporary storage + auto place = boost::get(dev_ctx.GetPlace()); + auto d_temp_storage = memory::Alloc(place, temp_storage_bytes, + memory::Allocator::kScratchpad); + + // Run sorting operation + // sort target level to get corresponding index + cub::DeviceRadixSort::SortPairsDescending( + d_temp_storage->ptr(), temp_storage_bytes, target_lvls_data, keys_out, + idx_in, idx_out, roi_num); + + int* restore_idx_data = + restore_index->mutable_data({roi_num, 1}, dev_ctx.GetPlace()); + // sort current index to get restore index + cub::DeviceRadixSort::SortPairsDescending( + d_temp_storage->ptr(), temp_storage_bytes, idx_out, keys_out, idx_in, + restore_idx_data, roi_num); + + Tensor offset_lod; + int* offset_lod_data = + offset_lod.mutable_data({lod_size + 1}, dev_ctx.GetPlace()); + for (int i = 0; i < num_level; ++i) { + Tensor sub_lod = sub_lod_list.Slice(i, i + 1); + int* sub_lod_data = sub_lod.data(); + // transfer length-based lod to offset-based lod + TransLoD(sub_lod_data, lod_size + 1, offset_lod_data); + int sub_rois_num = offset_lod_data[lod_size]; + Tensor sub_idx = index_out_t.Slice(0, sub_rois_num); + + multi_fpn_rois[i]->mutable_data({sub_rois_num, kBoxDim}, + dev_ctx.GetPlace()); + + GPUGather(dev_ctx, *fpn_rois, sub_idx, multi_fpn_rois[i]); + framework::LoD lod; + std::vector offset; + memory::Copy(platform::CPUPlace(), offset.data(), place, offset_lod_data, + sizeof(int) * (lod_size + 1), 0); + lod.emplace_back(offset); + multi_fpn_rois[i]->set_lod(lod); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + distribute_fpn_proposals, + ops::GPUDistributeFpnProposalsOpKernel, + ops::GPUDistributeFpnProposalsOpKernel); diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f63e856626d64ec13476c3f967a085624a007c3a --- /dev/null +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h @@ -0,0 +1,147 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/safe_ref.h" +#include "paddle/fluid/operators/gather.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +const int kBoxDim = 4; + +template +static inline T BBoxArea(const T* box, bool normalized) { + if (box[2] < box[0] || box[3] < box[1]) { + // If coordinate values are is invalid + // (e.g. xmax < xmin or ymax < ymin), return 0. + return static_cast(0.); + } else { + const T w = box[2] - box[0]; + const T h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + // If coordinate values are not within range [0, 1]. + return (w + 1) * (h + 1); + } + } +} + +template +class DistributeFpnProposalsOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* fpn_rois = context.Input("FpnRois"); + + auto multi_fpn_rois = + context.MultiOutput("MultiFpnRois"); + + auto* restore_index = + context.Output("RestoreIndex"); + + const int min_level = context.Attr("min_level"); + const int max_level = context.Attr("max_level"); + const int refer_level = context.Attr("refer_level"); + const int refer_scale = context.Attr("refer_scale"); + const int num_level = max_level - min_level + 1; + + // check that the fpn_rois is not empty + PADDLE_ENFORCE_EQ(fpn_rois->lod().size(), 1UL, + "DistributeFpnProposalsOp need 1 level of LoD"); + + auto fpn_rois_lod = fpn_rois->lod().back(); + int fpn_rois_num = fpn_rois_lod[fpn_rois_lod.size() - 1]; + std::vector target_level; + // std::vector target_level(fpn_rois_num, -1); + // record the number of rois in each level + std::vector num_rois_level(num_level, 0); + std::vector num_rois_level_integral(num_level + 1, 0); + for (int i = 0; i < fpn_rois_lod.size() - 1; ++i) { + Tensor fpn_rois_slice = + fpn_rois->Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]); + const T* rois_data = fpn_rois_slice.data(); + for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) { + // get the target level of current rois + T roi_scale = std::sqrt(BBoxArea(rois_data, false)); + int tgt_lvl = + std::floor(std::log2(roi_scale / refer_scale) + refer_level); + tgt_lvl = std::min(max_level, std::max(tgt_lvl, min_level)); + target_level.push_back(tgt_lvl); + num_rois_level[tgt_lvl - min_level]++; + rois_data += kBoxDim; + } + } + // define the output rois + // pointer which point to each level fpn rois + std::vector multi_fpn_rois_data(num_level); + // lod0 which will record the offset information of each level rois + std::vector> multi_fpn_rois_lod0; + for (int i = 0; i < num_level; ++i) { + // allocate memory for each level rois + multi_fpn_rois[i]->mutable_data({num_rois_level[i], kBoxDim}, + context.GetPlace()); + multi_fpn_rois_data[i] = multi_fpn_rois[i]->data(); + std::vector lod0(1, 0); + multi_fpn_rois_lod0.push_back(lod0); + // statistic start point for each level rois + num_rois_level_integral[i + 1] = + num_rois_level_integral[i] + num_rois_level[i]; + } + restore_index->mutable_data({1, fpn_rois_num}, context.GetPlace()); + int* restore_index_data = restore_index->data(); + std::vector restore_index_inter(fpn_rois_num, -1); + // distribute the rois into different fpn level by target level + for (int i = 0; i < fpn_rois_lod.size() - 1; ++i) { + Tensor fpn_rois_slice = + fpn_rois->Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]); + const T* rois_data = fpn_rois_slice.data(); + size_t cur_offset = fpn_rois_lod[i]; + // std::vector lod_offset[num_level]; + for (int j = 0; j < num_level; j++) { + multi_fpn_rois_lod0[j].push_back(multi_fpn_rois_lod0[j][i]); + } + for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) { + int lvl = target_level[cur_offset + j]; + memcpy(multi_fpn_rois_data[lvl - min_level], rois_data, + kBoxDim * sizeof(T)); + multi_fpn_rois_data[lvl - min_level] += kBoxDim; + int index_in_shuffle = num_rois_level_integral[lvl - min_level] + + multi_fpn_rois_lod0[lvl - min_level][i + 1]; + restore_index_inter[index_in_shuffle] = cur_offset + j; + multi_fpn_rois_lod0[lvl - min_level][i + 1]++; + rois_data += kBoxDim; + } + } + for (int i = 0; i < fpn_rois_num; ++i) { + restore_index_data[restore_index_inter[i]] = i; + } + // merge lod information into LoDTensor + for (int i = 0; i < num_level; ++i) { + framework::LoD lod; + lod.emplace_back(multi_fpn_rois_lod0[i]); + multi_fpn_rois[i]->set_lod(lod); + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/prior_box_op.h b/paddle/fluid/operators/detection/prior_box_op.h index f84405664596ebe25983e5acbbb82bfc18c38124..d3e26256b50f2d7010fee3738802d59173678b34 100644 --- a/paddle/fluid/operators/detection/prior_box_op.h +++ b/paddle/fluid/operators/detection/prior_box_op.h @@ -172,6 +172,10 @@ class PriorBoxOpKernel : public framework::OpKernel { framework::make_ddim({1, static_cast(variances.size())}), ctx.GetPlace()); auto var_et = framework::EigenTensor::From(var_t); + +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif for (size_t i = 0; i < variances.size(); ++i) { var_et(0, i) = variances[i]; } @@ -181,8 +185,15 @@ class PriorBoxOpKernel : public framework::OpKernel { vars->Resize({box_num, static_cast(variances.size())}); auto e_vars = framework::EigenMatrix::From(*vars); - e_vars = var_et.broadcast(Eigen::DSizes(box_num, 1)); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for collapse(2) +#endif + for (int i = 0; i < box_num; ++i) { + for (int j = 0; j < variances.size(); ++j) { + e_vars(i, j) = variances[j]; + } + } vars->Resize(var_dim); } }; // namespace operators diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.cc b/paddle/fluid/operators/detection/yolov3_loss_op.cc index 2a69ad4b53c26f5e2e0547e75e0d9c6518a8bcba..ab01bdf7ca8c5a369bd8838b1acc734364666992 100644 --- a/paddle/fluid/operators/detection/yolov3_loss_op.cc +++ b/paddle/fluid/operators/detection/yolov3_loss_op.cc @@ -144,34 +144,40 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { "The ignore threshold to ignore confidence loss.") .SetDefault(0.7); AddComment(R"DOC( - This operator generate yolov3 loss by given predict result and ground + This operator generates yolov3 loss based on given predict result and ground truth boxes. The output of previous network is in shape [N, C, H, W], while H and W - should be the same, specify the grid size, each grid point predict given - number boxes, this given number is specified by anchors, it should be - half anchors length, which following will be represented as S. In the - second dimention(the channel dimention), C should be S * (class_num + 5), - class_num is the box categoriy number of source dataset(such as coco), - so in the second dimention, stores 4 box location coordinates x, y, w, h - and confidence score of the box and class one-hot key of each anchor box. + should be the same, H and W specify the grid size, each grid point predict + given number boxes, this given number, which following will be represented as S, + is specified by the number of anchors, In the second dimension(the channel + dimension), C should be equal to S * (class_num + 5), class_num is the object + category number of source dataset(such as 80 in coco dataset), so in the + second(channel) dimension, apart from 4 box location coordinates x, y, w, h, + also includes confidence score of the box and class one-hot key of each anchor box. - While the 4 location coordinates if $$tx, ty, tw, th$$, the box predictions - correspnd to: + Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`, the box predictions + should be as follows: $$ - b_x = \sigma(t_x) + c_x - b_y = \sigma(t_y) + c_y + b_x = \\sigma(t_x) + c_x + $$ + $$ + b_y = \\sigma(t_y) + c_y + $$ + $$ b_w = p_w e^{t_w} + $$ + $$ b_h = p_h e^{t_h} $$ - While $$c_x, c_y$$ is the left top corner of current grid and $$p_w, p_h$$ - is specified by anchors. + In the equation above, :math:`c_x, c_y` is the left top corner of current grid + and :math:`p_w, p_h` is specified by anchors. As for confidence score, it is the logistic regression value of IoU between anchor boxes and ground truth boxes, the score of the anchor box which has - the max IoU should be 1, and if the anchor box has IoU bigger then ignore + the max IoU should be 1, and if the anchor box has IoU bigger than ignore thresh, the confidence score loss of this anchor box will be ignored. Therefore, the yolov3 loss consist of three major parts, box location loss, @@ -186,13 +192,13 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { In order to trade off box coordinate losses between big boxes and small boxes, box coordinate losses will be mutiplied by scale weight, which is - calculated as follow. + calculated as follows. $$ weight_{box} = 2.0 - t_w * t_h $$ - Final loss will be represented as follow. + Final loss will be represented as follows. $$ loss = (loss_{xy} + loss_{wh}) * weight_{box} diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.cc b/paddle/fluid/operators/distributed/brpc/brpc_client.cc index b8e63f42e2040730ac79c57651d86d9e3176fa01..a1a3443348129b5cdf057592fced8fdff238ac09 100644 --- a/paddle/fluid/operators/distributed/brpc/brpc_client.cc +++ b/paddle/fluid/operators/distributed/brpc/brpc_client.cc @@ -80,7 +80,7 @@ VarHandlePtr BRPCClient::AsyncSendVar(const std::string& ep, google::protobuf::Closure* done = brpc::NewCallback( &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - platform::RecordRPCEvent record_event(method, p_ctx); + platform::RecordRPCEvent record_event(method); ch_ctx->stub->SendVariable(cntl, &request, response, done); @@ -184,7 +184,7 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, google::protobuf::Closure* done = brpc::NewCallback( &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - platform::RecordRPCEvent record_event(method, p_ctx); + platform::RecordRPCEvent record_event(method); if (method_name == kGetMonomerRPC) { ch_ctx->stub->GetMonomerVariable(cntl, &req, response, done); @@ -272,7 +272,7 @@ VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, &cntl->request_attachment(), out_var_name_val, false, 0, table_name_val); - platform::RecordRPCEvent record_event(method, p_ctx); + platform::RecordRPCEvent record_event(method); google::protobuf::Closure* done = brpc::NewCallback( &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); @@ -311,7 +311,7 @@ VarHandlePtr BRPCClient::AsyncSendFetchBarrier(const std::string& ep, VarHandlePtr var_h( new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); - platform::RecordRPCEvent record_event(method, nullptr); + platform::RecordRPCEvent record_event(method); google::protobuf::Closure* done = brpc::NewCallback( &HandleFetchBarrierResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); @@ -406,7 +406,7 @@ VarHandlePtr BRPCClient::AsyncSendVarMessage( sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); cntl->set_timeout_ms(time_out); - platform::RecordRPCEvent record_event(method_name, nullptr); + platform::RecordRPCEvent record_event(method_name); VarHandlePtr var_h( new VarHandle(ep, method_name, req.varname(), nullptr, nullptr)); diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.cc b/paddle/fluid/operators/distributed/grpc/grpc_client.cc index 52310f8d04db6a5df9967c0a5ec9a5e95a24cdab..61e94dae3c7a107e10fa5e5518651014cec078bc 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_client.cc @@ -89,7 +89,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep, // stub context s->response_call_back_ = nullptr; - platform::RecordRPCEvent record_event(method, p_ctx); + platform::RecordRPCEvent record_event(method); auto call = s->stub_g_.PrepareUnaryCall( s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_); @@ -184,7 +184,7 @@ VarHandlePtr GRPCClient::_AsyncGetVar( // stub context s->response_call_back_ = ProcGetResponse; - platform::RecordRPCEvent record_event(method, p_ctx); + platform::RecordRPCEvent record_event(method); auto call = s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_); @@ -235,7 +235,7 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep, // stub context s->response_call_back_ = ProcGetResponse; - platform::RecordRPCEvent record_event(method, p_ctx); + platform::RecordRPCEvent record_event(method); auto call = s->stub_g_.PrepareUnaryCall( s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req, @@ -265,7 +265,7 @@ VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep, sendrecv::VariableMessage req; req.set_varname(BATCH_BARRIER_MESSAGE); - platform::RecordRPCEvent record_event(method, nullptr); + platform::RecordRPCEvent record_event(method); auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); @@ -290,7 +290,7 @@ VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep, sendrecv::VariableMessage req; req.set_varname(FETCH_BARRIER_MESSAGE); - platform::RecordRPCEvent record_event(method, nullptr); + platform::RecordRPCEvent record_event(method); auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); @@ -317,7 +317,7 @@ VarHandlePtr GRPCClient::AsyncGetMonomerBarrier(const std::string& ep, sendrecv::VariableMessage req; req.set_varname(var_name); - platform::RecordRPCEvent record_event(method, nullptr); + platform::RecordRPCEvent record_event(method); auto rpc = s->stub_->AsyncGetMonomerBarrier(s->context_.get(), req, &cq_); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); @@ -342,7 +342,7 @@ VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep, sendrecv::VariableMessage req; req.set_varname(COMPLETE_MESSAGE); - platform::RecordRPCEvent record_event(method, nullptr); + platform::RecordRPCEvent record_event(method); auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); @@ -372,7 +372,7 @@ VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep, req.set_varname(CHECKPOINT_SAVE_MESSAGE); req.set_out_varname(dir); - platform::RecordRPCEvent record_event(method, nullptr); + platform::RecordRPCEvent record_event(method); auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc/grpc_serde.cc index 6df4fd36f95b127a0bbc0725b83c4494b160785f..6e65aa5fae83536d229be63fbaf7874bd45f967d 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_serde.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_serde.cc @@ -38,7 +38,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, ::grpc::ByteBuffer* msg, const std::string& out_name, const int trainer_id, const std::string& table_name) { - platform::RecordRPCEvent record_event("serial", &ctx); + platform::RecordRPCEvent record_event("serial"); VarMsg request; TensorPayload* payload = nullptr; @@ -147,7 +147,7 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, const platform::DeviceContext& ctx, const framework::Scope* scope, framework::Variable** var, int* trainer_id) { - platform::RecordRPCEvent record_event("deserial", &ctx); + platform::RecordRPCEvent record_event("deserial"); operators::distributed::GRPCVariableResponse resp(scope, &ctx); PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!"); *var = resp.GetVar(); diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc index 6a6741d8fc54d22addca91b75dfabf5950c1a35a..7aaa607f1585c98fe2dd816e8d66e5c6fd171e80 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc @@ -77,8 +77,7 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel { } else { functor.RunMidWise(n, pre, post); } - z->set_layout(DataLayout::kMKLDNN); - z->set_format(x->format()); + z->set_mkldnn_prim_desc(x->get_mkldnn_prim_desc()); } else { PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN && x->format() != memory::format::format_undef, @@ -116,7 +115,8 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel { auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_pd); // create mkldnn memory for dst - memory dst_memory = memory(sum_pd.dst_primitive_desc(), z_data); + auto dst_mem_pd = sum_pd.dst_primitive_desc(); + memory dst_memory = memory(dst_mem_pd, z_data); std::vector inputs; inputs.push_back(srcs[0]); @@ -129,9 +129,7 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel { pipeline.push_back(sum_prim); stream(stream::kind::eager).submit(pipeline).wait(); - z->set_layout(DataLayout::kMKLDNN); - z->set_format( - (memory::format)dst_memory.get_primitive_desc().desc().data.format); + z->set_mkldnn_prim_desc(dst_mem_pd); } } }; @@ -152,24 +150,19 @@ class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel { auto* out = dout; auto *x = dout, *y = dout; - auto set_mkldnn_format = [](Tensor* in, const Tensor* out) { - in->set_layout(DataLayout::kMKLDNN); - in->set_format(out->format()); - }; - if (dx != nullptr && dy != nullptr && dx->dims() == dy->dims()) { if (dx->dims() == dy->dims()) { auto blas = math::GetBlas(ctx); if (dx) { blas.VCOPY(dout->numel(), dout->data(), dx->mutable_data(ctx.GetPlace())); - set_mkldnn_format(dx, dout); + dx->set_mkldnn_prim_desc(dout->get_mkldnn_prim_desc()); } if (dy) { blas.VCOPY(dout->numel(), dout->data(), dy->mutable_data(ctx.GetPlace())); - set_mkldnn_format(dy, dout); + dy->set_mkldnn_prim_desc(dout->get_mkldnn_prim_desc()); } } } else { diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc index 04e8800bbc888540c4df21360c767688eb19c423..f2f4d3fee053a1e5bacd3c2165dba960f3befea4 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc @@ -110,8 +110,9 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel { constexpr int simd_width = 16; int C = c / simd_width; - auto multiply = jit::Get, - platform::CPUPlace>(0); + auto multiply = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(0); #pragma omp parallel for collapse(2) for (int ni = 0; ni < n; ni++) { for (int ci = 0; ci < C; ci++) { diff --git a/paddle/fluid/operators/fake_dequantize_op.cc b/paddle/fluid/operators/fake_dequantize_op.cc index 5d6488c67e0db440c8d4609736523643dd666dcc..68c7227e5a7123e1e751dd55e243ee481bf36540 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cc +++ b/paddle/fluid/operators/fake_dequantize_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/fake_dequantize_op.h" #include +#include namespace paddle { namespace operators { @@ -76,6 +77,63 @@ $$Out = \frac{scale*X}{ max_range }$$ } }; +class FakeChannelWiseDequantizeMaxAbsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE( + ctx->HasInput("X"), + "Input(X) of FakeChannelWiseDequantizeMaxAbsOp should not be null."); + PADDLE_ENFORCE(ctx->HasInputs("Scales"), + "Input(Scales) of FakeChannelWiseDequantizeMaxAbsOp " + "should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("Out"), + "Output(Out) of FakeChannelWiseDequantizeMaxAbsOp should not be null."); + + ctx->ShareDim("X", /*->*/ "Out"); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class FakeChannelWiseDequantizeMaxAbsOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor) The input with float-32/64 type is the " + "low precision tensor."); + AddInput("Scales", + "(Tensors) The scales in quantization stage. " + "Now, `Scales` is a vector with at most two tensors. " + "If Scales has two elements, the second tensor should only have " + "one value.") + .AsDuplicable(); + AddOutput("Out", + "(Tensor) The output is the dequantized high " + "precision tensor."); + AddAttr>( + "quant_bits", + "Quantization bit numbers in quantization stage. " + "The size of `quant_bits` should be equal to the size of `Scales`.") + .SetDefault({8}); + + AddComment(R"DOC( +FakeChannelWiseDequantizeMaxAbsOp operator. + +This calculation is an opposite operation of FakeChannelWiseQuantizeMaxAbsOp: + +$$Out_c = \frac{X_c\prod_{i=1}^{n}Scales_{ic}}{\prod_{i=1}^{n}(2^{quant\_bits_i-1}-1)}$$ + +In the above formula, the range value of $c$ can be represented as $0 \leq c \lt \ the\ channel\ number\ of\ X$. +Besides, the size of $quant\_bits$ should be equal to the size of $Scales$, and it is called $n$ in the formula. + +Notes: In general, the per-channel quantization is only applied to weights and the activations use per-layer quantization. +)DOC"); + } +}; + } // namespace operators } // namespace paddle @@ -88,3 +146,11 @@ REGISTER_OPERATOR(fake_dequantize_max_abs, ops::FakeDequantizeMaxAbsOp, REGISTER_OP_CPU_KERNEL(fake_dequantize_max_abs, ops::FakeDequantizeMaxAbsKernel, ops::FakeDequantizeMaxAbsKernel); + +REGISTER_OPERATOR(fake_channel_wise_dequantize_max_abs, + ops::FakeChannelWiseDequantizeMaxAbsOp, + ops::FakeChannelWiseDequantizeMaxAbsOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(fake_channel_wise_dequantize_max_abs, + ops::FakeChannelWiseDequantizeMaxAbsKernel, + ops::FakeChannelWiseDequantizeMaxAbsKernel); diff --git a/paddle/fluid/operators/fake_dequantize_op.cu b/paddle/fluid/operators/fake_dequantize_op.cu index 225bcc45bc65bc9268d1e866a4358731eaf0c3ef..35dcc69279d0119e75c4c5072e7817c839b9e819 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cu +++ b/paddle/fluid/operators/fake_dequantize_op.cu @@ -55,3 +55,7 @@ using CUDA = paddle::platform::CUDADeviceContext; REGISTER_OP_CUDA_KERNEL(fake_dequantize_max_abs, ops::FakeDequantizeMaxAbsKernel, ops::FakeDequantizeMaxAbsKernel); +REGISTER_OP_CUDA_KERNEL( + fake_channel_wise_dequantize_max_abs, + ops::FakeChannelWiseDequantizeMaxAbsKernel, + ops::FakeChannelWiseDequantizeMaxAbsKernel); diff --git a/paddle/fluid/operators/fake_dequantize_op.h b/paddle/fluid/operators/fake_dequantize_op.h index d9923a10daa01ca06ebabb27cf9285b0628634bc..d05f2038531bbe9c35da54c94d2ef4d659acca70 100644 --- a/paddle/fluid/operators/fake_dequantize_op.h +++ b/paddle/fluid/operators/fake_dequantize_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" @@ -45,5 +46,42 @@ class FakeDequantizeMaxAbsKernel : public framework::OpKernel { } }; +template +class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel { + public: + virtual void Compute(const framework::ExecutionContext& ctx) const { + auto* in = ctx.Input("X"); + auto scales = ctx.MultiInput("Scales"); + auto* out = ctx.Output("Out"); + + PADDLE_ENFORCE_EQ(scales[0]->numel(), in->dims()[0], + "The number of first scale values must be the same with " + "first dimension value of Input(X)."); + + auto quant_bits = ctx.Attr>("quant_bits"); + int max_range = std::pow(2, quant_bits[0] - 1) - 1; + + auto& dev_ctx = ctx.template device_context(); + out->mutable_data(dev_ctx.GetPlace()); + + auto dequant = DequantizeFunctor(); + for (int64_t i = 0; i < in->dims()[0]; i++) { + framework::Tensor one_channel_in = in->Slice(i, i + 1); + framework::Tensor one_channel_out = out->Slice(i, i + 1); + framework::Tensor one_channel_scale = scales[0]->Slice(i, i + 1); + dequant(dev_ctx, &one_channel_in, &one_channel_scale, + static_cast(max_range), &one_channel_out); + } + + if (scales.size() == 2) { + PADDLE_ENFORCE_EQ( + scales[1]->numel(), 1, + "The second scale tensor should only have one value at now."); + max_range = std::pow(2, quant_bits[1] - 1) - 1; + dequant(dev_ctx, out, scales[1], static_cast(max_range), out); + } + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc index d51eb054a96d27f6ce87ba4b4e717f49dcd8a588..70186e5efa29b1324ff7f3954720276156fddaf1 100644 --- a/paddle/fluid/operators/fake_quantize_op.cc +++ b/paddle/fluid/operators/fake_quantize_op.cc @@ -31,7 +31,7 @@ template struct FindAbsMaxFunctor { void operator()(const platform::CPUDeviceContext& ctx, const T* in, const int num, T* out) { - *out = *(std::max_element(in + 0, in + num, Compare())); + *out = std::abs(*(std::max_element(in + 0, in + num, Compare()))); } }; @@ -46,10 +46,8 @@ struct ClipAndFakeQuantFunctor { platform::Transform trans; trans(ctx, in.data(), in.data() + in.numel(), out->mutable_data(ctx.GetPlace()), ClipFunctor(-s, s)); - auto in_e = framework::EigenVector::Flatten(in); auto out_e = framework::EigenVector::Flatten(*out); - - out_e.device(*ctx.eigen_device()) = (bin_cnt / s * in_e).round(); + out_e.device(*ctx.eigen_device()) = (bin_cnt / s * out_e).round(); } }; @@ -136,6 +134,60 @@ $$Out = round(X/scale * range)$$ } }; +class FakeChannelWiseQuantizeAbsMaxOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of FakeChannelWiseQuantizeOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("Out"), + "Output(Out) of FakeChannelWiseQuantizeOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("OutScales"), + "Output(Scales) of FakeChannelWiseQuantizeOp should not be null."); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->SetOutputDim("OutScales", {ctx->GetInputDim("X")[0]}); + ctx->ShareLoD("X", /*->*/ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); + } +}; + +class FakeChannelWiseQuantizeAbsMaxOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor) Input is float data type."); + AddOutput("Out", + "(Tensor) Output of quantized low level tensor, " + "but also saved as float data type."); + AddOutput("OutScales", "(Tensor) Current channel wise scale"); + AddAttr("bit_length", "(int, default 8)") + .SetDefault(8) + .AddCustomChecker([](const int& bit_length) { + PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16, + "'bit_length' should be between 1 and 16."); + }); + AddComment(R"DOC( +The scale of FakeChannelWiseQuantize operator is a vector. +In detail, each channel of the input X has a scale value. + +$$scale_c = max(abs(X_c))$$ +$$range = 2^{bit\_length - 1} - 1$$ +$$Out_c = round(\frac{X_c * range} {scale_c})$$ +In above three formulas, the range value of c is as follow: +$$0 \leq c \lt \ the\ channel\ number\ of\ X$$ +)DOC"); + } +}; + class FakeQuantizeRangeAbsMaxOp : public framework::OperatorWithKernel { public: FakeQuantizeRangeAbsMaxOp(const std::string& type, @@ -220,3 +272,10 @@ REGISTER_OPERATOR(fake_quantize_range_abs_max, ops::FakeQuantizeRangeAbsMaxOp, paddle::framework::EmptyGradOpMaker); REGISTER_OP_CPU_KERNEL(fake_quantize_range_abs_max, ops::FakeQuantizeRangeAbsMaxKernel); + +REGISTER_OPERATOR(fake_channel_wise_quantize_abs_max, + ops::FakeChannelWiseQuantizeAbsMaxOp, + ops::FakeChannelWiseQuantizeAbsMaxOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(fake_channel_wise_quantize_abs_max, + ops::FakeChannelWiseQuantizeAbsMaxKernel); diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu index a0ff6396210c2b3a7f8bd6b9f274b875d7fd4933..5da16a7c7314c62034bff67bcc8d099e2799c3de 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu +++ b/paddle/fluid/operators/fake_quantize_op.cu @@ -174,5 +174,7 @@ namespace ops = paddle::operators; using CUDA = paddle::platform::CUDADeviceContext; REGISTER_OP_CUDA_KERNEL(fake_quantize_abs_max, ops::FakeQuantizeAbsMaxKernel); +REGISTER_OP_CUDA_KERNEL(fake_channel_wise_quantize_abs_max, + ops::FakeChannelWiseQuantizeAbsMaxKernel); REGISTER_OP_CUDA_KERNEL(fake_quantize_range_abs_max, ops::FakeQuantizeRangeAbsMaxKernel); diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h index 7ace7573ec5c03ab8788cfc0aab614b7f80ea073..8b47600e7d99ad9e4e40ae162582d4c8461224ad 100644 --- a/paddle/fluid/operators/fake_quantize_op.h +++ b/paddle/fluid/operators/fake_quantize_op.h @@ -63,6 +63,39 @@ class FakeQuantizeAbsMaxKernel : public framework::OpKernel { } }; +template +class FakeChannelWiseQuantizeAbsMaxKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + + auto* out = context.Output("Out"); + auto* out_scales = context.Output("OutScales"); + T* out_scales_data = out_scales->mutable_data(context.GetPlace()); + out->mutable_data(context.GetPlace()); + + int bit_length = context.Attr("bit_length"); + int bin_cnt = std::pow(2, bit_length - 1) - 1; + + auto& dev_ctx = context.template device_context(); + auto find_abs_max = FindAbsMaxFunctor(); + for (int64_t i = 0; i < in->dims()[0]; i++) { + framework::Tensor one_channel = in->Slice(i, i + 1); + const T* one_channel_data = one_channel.data(); + find_abs_max(dev_ctx, one_channel_data, one_channel.numel(), + &out_scales_data[i]); + } + auto clip_quant = ClipAndFakeQuantFunctor(); + for (int64_t i = 0; i < in->dims()[0]; i++) { + framework::Tensor one_channel_in = in->Slice(i, i + 1); + framework::Tensor one_channel_out = out->Slice(i, i + 1); + framework::Tensor one_channel_scale = out_scales->Slice(i, i + 1); + clip_quant(dev_ctx, one_channel_in, one_channel_scale, bin_cnt, + &one_channel_out); + } + } +}; + template class FakeQuantizeRangeAbsMaxKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc index 38e57a41ed253eab4d0713af8bb14bac19041f6d..eb4617a9359353820fc41b9ad1c8db5327fdacde 100644 --- a/paddle/fluid/operators/fc_op.cc +++ b/paddle/fluid/operators/fc_op.cc @@ -47,7 +47,7 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE(in_dims.size() == 2 || in_dims.size() == 4, "Fully Connected input should be 2-D or 4-D tensor."); } - PADDLE_ENFORCE_EQ(w_dims.size(), 2UL, + PADDLE_ENFORCE_EQ(w_dims.size(), 2, "Fully Connected input should be 2-D tensor."); int in_num_col_dims = ctx->Attrs().Get("in_num_col_dims"); PADDLE_ENFORCE_GT( diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc index fe4c73f4723355d4b56d075423de29b45b9cd4e4..80caf70b08e65932d6ccb90a5293d072b2b2bc72 100644 --- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc @@ -23,6 +23,9 @@ class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { + if (ctx->IsRuntime()) { + return; + } PADDLE_ENFORCE(ctx->HasInput("W"), "Input W of FusedEmbeddingSeqPoolOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("Ids"), @@ -42,36 +45,15 @@ class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel { // we only support sum now PADDLE_ENFORCE_EQ(combiner, "sum"); - int64_t last_dim = table_dims[1]; - for (int i = 1; i != ids_dims.size(); ++i) { - last_dim *= ids_dims[i]; - } - - if (ctx->IsRuntime()) { - framework::Variable* ids_var = - boost::get(ctx->GetInputVarPtrs("Ids")[0]); - const auto& ids_lod = ids_var->Get().lod(); + int64_t last_dim = FusedEmbeddingSeqPoolLastDim(table_dims, ids_dims); + // in compile time, the lod level of ids must be 1 + framework::VarDesc* ids_desc = + boost::get(ctx->GetInputVarPtrs("Ids")[0]); + PADDLE_ENFORCE_EQ(ids_desc->GetLoDLevel(), 1); - // in run time, the LoD of ids must be 1 - PADDLE_ENFORCE(ids_lod.size(), 1u, - "The LoD level of Input(Ids) must be 1"); - PADDLE_ENFORCE_GE(ids_lod[0].size(), 1u, "The LoD could NOT be empty"); - - int64_t batch_size = ids_lod[0].size() - 1; - - // in run time, the shape from Ids -> output - // should be [seq_length, 1] -> [batch_size, embedding_size] - ctx->SetOutputDim("Out", framework::make_ddim({batch_size, last_dim})); - } else { - // in compile time, the lod level of ids must be 1 - framework::VarDesc* ids_desc = - boost::get(ctx->GetInputVarPtrs("Ids")[0]); - PADDLE_ENFORCE_EQ(ids_desc->GetLoDLevel(), 1); - - // in compile time, the shape from Ids -> output - // should be [-1, 1] -> [-1, embedding_size] - ctx->SetOutputDim("Out", framework::make_ddim({-1, last_dim})); - } + // in compile time, the shape from Ids -> output + // should be [-1, 1] -> [-1, embedding_size] + ctx->SetOutputDim("Out", framework::make_ddim({-1, last_dim})); } protected: diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h index 758432fd9e4197302e0bd8f76a1ca7c524026a70..5e2e336e7117cc4816a52405b7bc2689bc03dd46 100644 --- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h @@ -21,7 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/jit/kernels.h" namespace paddle { namespace operators { @@ -37,36 +37,39 @@ struct EmbeddingVSumFunctor { const LoDTensor *table_t, const LoDTensor *ids_t, LoDTensor *output_t) { auto *table = table_t->data(); - int64_t row_number = table_t->dims()[0]; - int64_t row_width = table_t->dims()[1]; - int64_t last_dim = output_t->dims()[1]; + int64_t table_height = table_t->dims()[0]; + int64_t table_width = table_t->dims()[1]; + int64_t out_width = output_t->dims()[1]; const int64_t *ids = ids_t->data(); auto ids_lod = ids_t->lod()[0]; - int64_t ids_count = ids_t->numel() / ids_lod.back(); - + int64_t idx_width = ids_t->numel() / ids_lod.back(); auto *output = output_t->mutable_data(context.GetPlace()); - auto blas = math::GetBlas(context); - for (int64_t i = 0; i != ids_lod.size() - 1; ++i) { - size_t begin = ids_lod[i] * ids_count; - for (int64_t j = 0; j != ids_count; ++j) { - PADDLE_ENFORCE_LT(ids[begin], row_number); - PADDLE_ENFORCE_GE(ids[begin], 0, "ids %d", i); - blas.VCOPY(row_width, table + ids[begin + j] * row_width, - output + i * last_dim + j * row_width); - } - - for (int64_t r = (ids_lod[i] + 1) * ids_count; - r < ids_lod[i + 1] * ids_count; ++r) { - PADDLE_ENFORCE_LT(ids[r], row_number); - PADDLE_ENFORCE_GE(ids[r], 0, "ids %d", i); - blas.AXPY(row_width, 1., table + ids[r] * row_width, - output + i * last_dim + (r % ids_count) * row_width); - } + PADDLE_ENFORCE_LE(table_width * idx_width, out_width); + PADDLE_ENFORCE_GT(ids_lod.size(), 1UL, "The LoD[0] could NOT be empty"); + + jit::emb_seq_pool_attr_t attr(table_height, table_width, 0, idx_width, + out_width, jit::SeqPoolType::kSum); + for (size_t i = 0; i != ids_lod.size() - 1; ++i) { + attr.index_height = ids_lod[i + 1] - ids_lod[i]; + auto emb_seqpool = + jit::KernelFuncs, platform::CPUPlace>::Cache() + .At(attr); + emb_seqpool(table, ids + ids_lod[i] * idx_width, output + i * out_width, + &attr); } } }; +inline int FusedEmbeddingSeqPoolLastDim(const framework::DDim &table_dims, + const framework::DDim &ids_dims) { + int64_t last_dim = table_dims[1]; + for (int i = 1; i != ids_dims.size(); ++i) { + last_dim *= ids_dims[i]; + } + return last_dim; +} + template class FusedEmbeddingSeqPoolKernel : public framework::OpKernel { public: @@ -76,6 +79,17 @@ class FusedEmbeddingSeqPoolKernel : public framework::OpKernel { const LoDTensor *table_var = context.Input("W"); const std::string &combiner_type = context.Attr("combiner"); + int64_t last_dim = + FusedEmbeddingSeqPoolLastDim(table_var->dims(), ids_t->dims()); + const auto &ids_lod = ids_t->lod(); + // in run time, the LoD of ids must be 1 + PADDLE_ENFORCE(ids_lod.size(), 1UL, + "The LoD level of Input(Ids) must be 1"); + int64_t batch_size = ids_lod[0].size() - 1; + // in run time, the shape from Ids -> output + // should be [seq_length, 1] -> [batch_size, last_dim] + output_t->Resize({batch_size, last_dim}); + if (combiner_type == "sum") { EmbeddingVSumFunctor functor; functor(context, table_var, ids_t, output_t); @@ -111,7 +125,7 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { auto *ids_data = ids->data(); int64_t ids_num = ids->numel(); auto lod = ids->lod()[0]; - int64_t row_width = d_output->dims()[1]; + int64_t out_width = d_output->dims()[1]; framework::Vector *new_rows = d_table->mutable_rows(); new_rows->resize(ids_num); @@ -122,15 +136,14 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { T *d_table_data = d_table_value->mutable_data(context.GetPlace()); const T *d_output_data = d_output->data(); - auto blas = math::GetBlas(context); + auto vbroadcast = + jit::KernelFuncs, platform::CPUPlace>::Cache() + .At(out_width); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { int64_t h = static_cast(lod[i + 1] - lod[i]); - int64_t in_offset = lod[i] * row_width; - const T *out_pos = d_output_data + i * row_width; - T *in_pos = d_table_data + in_offset; - for (int r = 0; r != h; ++r) { - blas.VCOPY(row_width, out_pos, in_pos + r * row_width); - } + const T *src = d_output_data + i * out_width; + T *dst = d_table_data + lod[i] * out_width; + vbroadcast(src, dst, h, out_width); } } else { LOG(ERROR) << "Dense is not supported in fused_embedding_seq_pool_op now"; diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index 66acba49e5ac25c5097042225ccfe30b258040fa..ba5f0747c4d04bbb41f34dc7f895b22d38392ea6 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -182,29 +182,32 @@ class FusionGRUKernel : public framework::OpKernel { const int total_T = x_dims[0]; \ const int D3 = wh_dims[1] -#define INIT_OTHER_DEFINES \ - auto* h0 = ctx.Input("H0"); \ - auto* wx = ctx.Input("WeightX"); \ - auto* bias = ctx.Input("Bias"); \ - auto* hidden_out = ctx.Output("Hidden"); \ - bool is_reverse = ctx.Attr("is_reverse"); \ - const int M = x_dims[1]; \ - const int D = wh_dims[0]; \ - const int D2 = D * 2; \ - const jit::gru_attr_t attr( \ - D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ - jit::to_kerneltype(ctx.Attr("activation"))); \ - jit::gru_t one_step; \ - auto ComputeH1 = \ - jit::Get, platform::CPUPlace>(attr); \ - auto ComputeHtPart1 = \ - jit::Get, platform::CPUPlace>(attr); \ - auto ComputeHtPart2 = \ - jit::Get, platform::CPUPlace>(attr); \ - const T* x_data = x->data(); \ - const T* wx_data = wx->data(); \ - const T* wh_data = wh->data(); \ - auto place = ctx.GetPlace(); \ +#define INIT_OTHER_DEFINES \ + auto* h0 = ctx.Input("H0"); \ + auto* wx = ctx.Input("WeightX"); \ + auto* bias = ctx.Input("Bias"); \ + auto* hidden_out = ctx.Output("Hidden"); \ + bool is_reverse = ctx.Attr("is_reverse"); \ + const int M = x_dims[1]; \ + const int D = wh_dims[0]; \ + const int D2 = D * 2; \ + const jit::gru_attr_t attr( \ + D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ + jit::to_kerneltype(ctx.Attr("activation"))); \ + jit::gru_t one_step; \ + auto ComputeH1 = \ + jit::KernelFuncs, platform::CPUPlace>::Cache().At( \ + attr); \ + auto ComputeHtPart1 = \ + jit::KernelFuncs, platform::CPUPlace>::Cache() \ + .At(attr); \ + auto ComputeHtPart2 = \ + jit::KernelFuncs, platform::CPUPlace>::Cache() \ + .At(attr); \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = wh->data(); \ + auto place = ctx.GetPlace(); \ T* xx_data = xx->mutable_data(place) void SeqCompute(const framework::ExecutionContext& ctx) const { diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc index b11b7c11bfe0ae4c79d5bb39844bce618649c44d..c8c07bd126d5b4eac688d43fd794856f8222525a 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc @@ -235,32 +235,34 @@ class FuisonLSTMKernel : public framework::OpKernel { const int D = wh_dims[0]; \ const int D4 = wh_dims[1] -#define INIT_OTHER_DEFINES \ - const T* x_data = x->data(); \ - const T* wx_data = wx->data(); \ - const T* wh_data = wh->data(); \ - /* diagonal weight*/ \ - const T* wp_data = bias->data() + D4; \ - /* for peephole only*/ \ - T* checked_cell_data = nullptr; \ - auto place = ctx.GetPlace(); \ - if (use_peepholes) { \ - /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ - auto* checked_cell = ctx.Output("CheckedCell"); \ - checked_cell_data = checked_cell->mutable_data(place); \ - } \ - const jit::lstm_attr_t attr( \ - D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ - jit::to_kerneltype(ctx.Attr("candidate_activation")), \ - jit::to_kerneltype(ctx.Attr("cell_activation")), \ - use_peepholes); \ - jit::lstm_t one_step; \ - one_step.wp = wp_data; \ - one_step.checked = checked_cell_data; \ - auto ComputeC1H1 = \ - jit::Get, platform::CPUPlace>(attr); \ - auto ComputeCtHt = \ - jit::Get, platform::CPUPlace>(attr) +#define INIT_OTHER_DEFINES \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = wh->data(); \ + /* diagonal weight*/ \ + const T* wp_data = bias->data() + D4; \ + /* for peephole only*/ \ + T* checked_cell_data = nullptr; \ + auto place = ctx.GetPlace(); \ + if (use_peepholes) { \ + /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ + auto* checked_cell = ctx.Output("CheckedCell"); \ + checked_cell_data = checked_cell->mutable_data(place); \ + } \ + const jit::lstm_attr_t attr( \ + D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ + jit::to_kerneltype(ctx.Attr("candidate_activation")), \ + jit::to_kerneltype(ctx.Attr("cell_activation")), \ + use_peepholes); \ + jit::lstm_t one_step; \ + one_step.wp = wp_data; \ + one_step.checked = checked_cell_data; \ + auto ComputeC1H1 = \ + jit::KernelFuncs, platform::CPUPlace>::Cache().At( \ + attr); \ + auto ComputeCtHt = \ + jit::KernelFuncs, platform::CPUPlace>::Cache().At( \ + attr) // Wh GEMM #define GEMM_WH_ADDON(bs, prev, out) \ diff --git a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc index e9e2a3b1f5c1c00bb2e95b6171ecd09bfe7a0d21..6be35de65f48525b2da7d5c9ef260b2d0798b67b 100644 --- a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc +++ b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc @@ -37,7 +37,7 @@ void FusionRepeatedFCReluOp::InferShape( "Output(Out) of FusionRepeatedFCReluOp should not be null."); auto i_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ(i_dims.size(), 2UL, "Input shape size should be 2"); + PADDLE_ENFORCE_EQ(i_dims.size(), 2, "Input shape size should be 2"); auto w_dims = ctx->GetInputsDim("W"); auto b_dims = ctx->GetInputsDim("Bias"); @@ -49,7 +49,7 @@ void FusionRepeatedFCReluOp::InferShape( "inpute width should be equal with weight height"); for (size_t i = 1; i < sz; ++i) { - PADDLE_ENFORCE_EQ(w_dims[i].size(), 2UL, + PADDLE_ENFORCE_EQ(w_dims[i].size(), 2, "Every weight shape size should be 2."); PADDLE_ENFORCE_EQ(framework::product(b_dims[i]), w_dims[i][1], "The length of Bias must be equal with w_dims[1]."); @@ -82,9 +82,11 @@ template static void fc_relu(const T* x, const T* w, const T* b, T* y, const jit::matmul_attr_t& attr) { auto matmul = - jit::Get, platform::CPUPlace>(attr); + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + attr); auto addbias_relu = - jit::Get, platform::CPUPlace>(attr.n); + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + attr.n); matmul(x, w, y, &attr); T* dst = y; for (int i = 0; i < attr.m; ++i) { diff --git a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc index aaef46de0d3b88720a762abb000e42d560fbd8cf..d091da5aa8a7e7ec30798d68021bfd2b9b87b32f 100644 --- a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc @@ -39,7 +39,7 @@ void FusionSeqExpandConcatFCOp::InferShape( auto ins_dims = ctx->GetInputsDim("X"); auto w_dims = ctx->GetInputDim("FCWeight"); // (M0+M1+M2+..) x D - PADDLE_ENFORCE_EQ(w_dims.size(), 2UL, "Input(FCWeight)'s rank must be 2."); + PADDLE_ENFORCE_EQ(w_dims.size(), 2, "Input(FCWeight)'s rank must be 2."); const int D = w_dims[1]; int sum = ins_dims[0][1]; for (size_t i = 1; i < ins_dims.size(); ++i) { diff --git a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc index b181140db750a8d1b74c0b6cc93259a208fe5b06..25916768c08e7222ba95bd6e1999400a923b21a3 100644 --- a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc @@ -39,7 +39,7 @@ void FusionSeqPoolConcatOp::InferShape( // The output height should be confirmed in Compute, // since input lod is not accessible here. - PADDLE_ENFORCE_EQ(ins_dims[0].size(), 2UL, + PADDLE_ENFORCE_EQ(ins_dims[0].size(), 2, "The dims size of first input should be 2."); ctx->SetOutputDim("Out", {-1, ins_dims[0][axis] * static_cast(n)}); } @@ -98,7 +98,7 @@ class FusionSeqPoolConcatKernel : public framework::OpKernel { attr.type = jit::SeqPoolType::kSqrt; } auto seqpool = - jit::Get, platform::CPUPlace>( + jit::KernelFuncs, platform::CPUPlace>::Cache().At( attr); size_t n = ins.size(); size_t dst_step_size = n * w; diff --git a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc index 8c8b079633aacb711aa304ec7016c37c6bec61ce..53679ebddee1ceec102b5861c54b398aa4da4cde 100644 --- a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc +++ b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc @@ -42,7 +42,7 @@ void FusionSquaredMatSubOp::InferShape( auto y_dims = ctx->GetInputDim("Y"); PADDLE_ENFORCE_EQ(x_dims.size(), y_dims.size(), "Input tensors dims size should be equal."); - PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "Input tensors should be a Matrix."); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input tensors should be a Matrix."); PADDLE_ENFORCE_EQ(x_dims[1], y_dims[0], "Inputs Matrix should be multiply."); ctx->SetOutputDim("SquaredX", x_dims); @@ -94,19 +94,23 @@ class FusionSquaredMatSubKernel : public framework::OpKernel { int o_numel = attr.m * attr.n; auto vsquare_x = - jit::Get, platform::CPUPlace>(attr.m * - attr.k); + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + attr.m * attr.k); auto vsquare_y = - jit::Get, platform::CPUPlace>(attr.k * - attr.n); + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + attr.k * attr.n); auto vsquare_xy = - jit::Get, platform::CPUPlace>(o_numel); + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + o_numel); auto vsub = - jit::Get, platform::CPUPlace>(o_numel); + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + o_numel); auto vscal = - jit::Get, platform::CPUPlace>(o_numel); + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + o_numel); auto matmul = - jit::Get, platform::CPUPlace>(attr); + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + attr); const T* x_data = x->data(); const T* y_data = y->data(); diff --git a/paddle/fluid/operators/group_norm_op.cc b/paddle/fluid/operators/group_norm_op.cc index e18d9841bb87c6a684d53e1bceb6c20a37dcfcfa..cbdffa0db8277dbf7257c3b3c1d03c1b459d5b2b 100644 --- a/paddle/fluid/operators/group_norm_op.cc +++ b/paddle/fluid/operators/group_norm_op.cc @@ -170,13 +170,48 @@ class GroupNormGradMaker : public framework::SingleGradOpDescMaker { } }; +class GroupNormInplaceInToOut : public framework::InplaceInToOut { + public: + using InplaceInToOut::InplaceInToOut; + + protected: + std::unordered_map Apply( + const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + return {{"X", "Y"}}; + } +}; + +class GroupNormGradInplaceInToOut : public framework::InplaceInToOut { + public: + using InplaceInToOut::InplaceInToOut; + + protected: + std::unordered_map Apply( + const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + return {{framework::GradVarName("Y"), framework::GradVarName("X")}}; + } +}; + +class GroupNormOpInferVarType + : public framework::PassInDtypeAndVarTypeToOutput { + protected: + std::unordered_map GetInputOutputWithSameType() + const override { + return {{"X", /*->*/ "Y"}}; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(group_norm, ops::GroupNormOp, ops::GroupNormOpMaker, - ops::GroupNormGradMaker); -REGISTER_OPERATOR(group_norm_grad, ops::GroupNormGradOp); + ops::GroupNormOpInferVarType, ops::GroupNormGradMaker, + ops::GroupNormInplaceInToOut); +REGISTER_OPERATOR(group_norm_grad, ops::GroupNormGradOp, + ops::GroupNormGradInplaceInToOut); REGISTER_OP_CPU_KERNEL( group_norm, ops::GroupNormKernel, ops::GroupNormKernel); diff --git a/paddle/fluid/operators/hash_op.cc b/paddle/fluid/operators/hash_op.cc index b2c2c7954b79658e66f1524a81bcad0b7bf22c35..7a29f80ff1ce413519ea9cea6a35747bdced5885 100644 --- a/paddle/fluid/operators/hash_op.cc +++ b/paddle/fluid/operators/hash_op.cc @@ -14,7 +14,6 @@ limitations under the License. */ #include "paddle/fluid/operators/hash_op.h" #include -#include namespace paddle { namespace operators { @@ -27,6 +26,9 @@ class HashOp : public framework::OperatorWithKernel { : OperatorWithKernel(type, inputs, outputs, attrs) {} void InferShape(framework::InferShapeContext *ctx) const override { + if (ctx->IsRuntime()) { + return; + } PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of HashOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), @@ -36,15 +38,8 @@ class HashOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(dims.size(), 2UL, "The input of hash_op's dimensions must be 2"); std::vector out_dims; - out_dims.reserve(dims.size() + 1); - // copy all dims except the last one - for (int i = 0u; i != dims.size() - 1; ++i) { - out_dims.emplace_back(dims[i]); - } int num_hash = ctx->Attrs().Get("num_hash"); - out_dims.emplace_back(num_hash); - // keep the last dim to 1 - out_dims.emplace_back(1); + HashOutputSize(dims, out_dims, num_hash); ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); ctx->ShareLoD("X", /*->*/ "Out"); @@ -71,4 +66,4 @@ $$Out = scale * X$$ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(hash, ops::HashOp, ops::HashOpMaker); -REGISTER_OP_CPU_KERNEL(hash, ops::HashKerel, ops::HashKerel); +REGISTER_OP_CPU_KERNEL(hash, ops::HashKernel, ops::HashKernel); diff --git a/paddle/fluid/operators/hash_op.h b/paddle/fluid/operators/hash_op.h index 9781bb0f453642cefb3eb59a05389c339a7de39d..9e7ad5235ff483a2fc0cfbb8bc35c620084bb896 100644 --- a/paddle/fluid/operators/hash_op.h +++ b/paddle/fluid/operators/hash_op.h @@ -17,21 +17,34 @@ limitations under the License. */ extern "C" { #include } +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { -// template + +inline void HashOutputSize(const framework::DDim& in_dims, + std::vector& out_dims, // NOLINT + int num_hash) { + out_dims.reserve(in_dims.size() + 1); + // copy all dims except the last one + for (int i = 0u; i != in_dims.size() - 1; ++i) { + out_dims.emplace_back(in_dims[i]); + } + out_dims.emplace_back(num_hash); + // keep the last dim to 1 + out_dims.emplace_back(1); +} + template -class HashKerel : public framework::OpKernel { +class HashKernel : public framework::OpKernel { public: virtual void Compute(const framework::ExecutionContext& context) const { auto* out_t = context.Output("Out"); auto* in_t = context.Input("X"); int mod_by = context.Attr("mod_by"); int num_hash = context.Attr("num_hash"); - auto* output = out_t->mutable_data(context.GetPlace()); auto in_dims = in_t->dims(); auto in_lod = in_t->lod(); @@ -39,6 +52,11 @@ class HashKerel : public framework::OpKernel { static_cast(in_dims[0]), in_lod[0].back(), "The actual input data's size mismatched with LoD information."); + std::vector out_dims; + HashOutputSize(in_dims, out_dims, num_hash); + out_t->Resize(framework::make_ddim(out_dims)); + auto* output = out_t->mutable_data(context.GetPlace()); + auto seq_length = in_dims[0]; auto last_dim = in_dims[in_dims.size() - 1]; auto* input = in_t->data(); @@ -49,6 +67,7 @@ class HashKerel : public framework::OpKernel { } input += last_dim; } + out_t->set_lod(in_t->lod()); } }; diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc index de91ba6270ac2ed22c8380878c0a0037fb1629c0..10d01af982d01800bdd2d5d59761cfb09e2a8139 100644 --- a/paddle/fluid/operators/interpolate_op.cc +++ b/paddle/fluid/operators/interpolate_op.cc @@ -84,13 +84,13 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault("bilinear"); AddAttr( "align_corners", - "an optinal bool. Defaults to True. " + "an optional bool. Defaults to True. " "If True, the centers of 4 corner pixels of the input and output " "tensors are aligned, preserving the values at the corner pixels, " - "if Flase, are not aligned") + "If False, are not aligned") .SetDefault(true); AddAttr("align_mode", - "(int, default \'1\'), optional for bilinear interpolation" + "(int, default \'1\'), optional for bilinear interpolation, " "can be \'0\' for src_idx = scale*(dst_indx+0.5)-0.5 , " "can be \'1\' for src_idx = scale*dst_index .") .SetDefault(1); diff --git a/paddle/fluid/operators/is_empty_op.cc b/paddle/fluid/operators/is_empty_op.cc index ba50bdf34baf2b9b0748b24c98c274aa18e22e36..092a6eae6f5b7edcc5656522377de10a08a01ea8 100644 --- a/paddle/fluid/operators/is_empty_op.cc +++ b/paddle/fluid/operators/is_empty_op.cc @@ -34,9 +34,8 @@ class IsEmptyOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - framework::OpKernelType kt = framework::OpKernelType( - ctx.Input("X")->type(), platform::CPUPlace()); - return kt; + auto *x = ctx.Input("X"); + return framework::OpKernelType(x->type(), x->place()); } }; @@ -58,7 +57,6 @@ It will just return product(tensor.ddims()) > 0; } // namespace paddle namespace ops = paddle::operators; - REGISTER_OPERATOR(is_empty, ops::IsEmptyOp, ops::IsEmptyOpMaker, paddle::framework::EmptyGradOpMaker); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/is_empty_op.cu.cc b/paddle/fluid/operators/is_empty_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..3c256503baf6ba3bc8f8dff866a2ce9c57ec5bf1 --- /dev/null +++ b/paddle/fluid/operators/is_empty_op.cu.cc @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/is_empty_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + is_empty, ops::IsEmptyOpKernel, + ops::IsEmptyOpKernel, + ops::IsEmptyOpKernel, + ops::IsEmptyOpKernel); diff --git a/paddle/fluid/operators/is_empty_op.h b/paddle/fluid/operators/is_empty_op.h index 3e3af22fa8d842b6a1e67418446f1a40949e046b..4f6419eb577709836275481cf617c07ea6c7f4c0 100644 --- a/paddle/fluid/operators/is_empty_op.h +++ b/paddle/fluid/operators/is_empty_op.h @@ -28,6 +28,9 @@ class IsEmptyOpKernel : public framework::OpKernel { // get output auto* output_tensor = context.Output("Out"); + // Note: is_empty is always executed on CPU and the output data should + // always be allocated for CPUPlace. We reigister CUDA kernel for this op to + // avoid the unnecessary data transform. output_tensor->mutable_data(platform::CPUPlace())[0] = framework::product(input_tensor->dims()) == 0; } diff --git a/paddle/fluid/operators/jit/CMakeLists.txt b/paddle/fluid/operators/jit/CMakeLists.txt index 35775d7ec9efcdbad69e4491792f7d4e513832ad..47d6c83f2adf8c4b7476410ce7c1d435633a8bfe 100644 --- a/paddle/fluid/operators/jit/CMakeLists.txt +++ b/paddle/fluid/operators/jit/CMakeLists.txt @@ -5,7 +5,7 @@ file(APPEND ${jit_file} "\#pragma once\n") file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/helper.h\"\n") file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/registry.h\"\n\n") -set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place) +set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place xxhash) file(GLOB jit_kernel_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") list(REMOVE_ITEM jit_kernel_cc_srcs test.cc benchmark.cc) diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 97ddf223aefcdfaf8a488f93a152336c1ed458f4..fbb04a166ef52efd9bd05f27ca656d928d97fb96 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -59,8 +59,6 @@ BenchJITKernel* InsertBenchmark(BenchJITKernel* b) { InsertBenchmark(new BenchJITKernel_##name##_##dtype##_##place##_()); \ void BenchJITKernel_##name##_##dtype##_##place##_::Run() -#define BENCH_FP32_CPU(name) BENCH_JITKERNEL(name, FP32, CPU) - void RUN_ALL_BENCHMARK() { for (auto p : g_all_benchmarks) { if (!FLAGS_filter.empty() && FLAGS_filter != p->Name()) { @@ -90,11 +88,11 @@ std::vector TestSizes() { return s; } -template +template struct BenchFunc { // return this function avg time // TODO(TJ): clear cache every time - double operator()(const typename KernelTuples::func_type tgt, Args... args) { + double operator()(const typename KernelTuple::func_type tgt, Args... args) { for (int i = 0; i < FLAGS_burning; ++i) { tgt(args...); } @@ -109,40 +107,17 @@ struct BenchFunc { namespace jit = paddle::operators::jit; -template -void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { - BenchFunc benchmark; +template +void BenchAllImpls(const typename KernelTuple::attr_type& attr, Args... args) { + BenchFunc benchmark; std::vector> infos; - // test refer - auto refer = jit::GetRefer(); - if (!refer) { - LOG(FATAL) << "Refer can not be empty!"; + auto funcs = jit::GetAllCandidateFuncsWithTypes(attr); + for (auto f : funcs) { + infos.push_back(std::make_pair(f.first, benchmark(f.second, args...))); } - infos.push_back(std::make_pair("Refer", benchmark(refer, args...))); - // test jitcode - auto jitcode = jit::GetJitCode(attr); - if (jitcode) { - infos.push_back(std::make_pair("JitCode", benchmark(jitcode, args...))); - } - // test all impls in more - jit::KernelKey kkey(KT, PlaceType()); - auto& pool = jit::KernelPool().Instance().AllKernels(); - auto iter = pool.find(kkey); - if (iter != pool.end()) { - auto& impls = iter->second; - for (auto& impl : impls) { - auto i = dynamic_cast*>(impl.get()); - if (i && i->UseMe(attr)) { - auto more = i->GetFunc(); - infos.push_back( - std::make_pair(i->ImplType(), benchmark(more, args...))); - } - } - } // Test result from Get function - auto tgt = jit::Get(attr); + auto tgt = jit::KernelFuncs::Cache().At(attr); if (!tgt) { LOG(FATAL) << "Target can not be empty!"; } @@ -150,7 +125,8 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { // print std::ostringstream loginfos; - loginfos << "Kernel Type " << jit::to_string(KT) << ": " << attr << ": "; + loginfos << "Kernel Type " << jit::to_string(KernelTuple::kernel_type) << ": " + << attr << ": "; for (auto pair : infos) { loginfos << pair.first << " takes " << pair.second << " us; "; } @@ -159,8 +135,9 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { using Tensor = paddle::framework::Tensor; -template -void BenchXYZNKernel() { +template +void BenchKernelXYZN() { + using T = typename KernelTuple::data_type; for (int d : TestSizes()) { Tensor x, y, z; x.Resize({d}); @@ -171,16 +148,16 @@ void BenchXYZNKernel() { T* z_data = z.mutable_data(PlaceType()); RandomVec(d, x_data); RandomVec(d, y_data); - BenchAllImpls, PlaceType>(d, x.data(), - y.data(), z_data, d); + BenchAllImpls(d, x.data(), y.data(), z_data, + d); // test inplace - BenchAllImpls, PlaceType>(d, x.data(), z_data, - z_data, d); + BenchAllImpls(d, x.data(), z_data, z_data, d); } } -template -void BenchAXYNKernel() { +template +void BenchKernelAXYN() { + using T = typename KernelTuple::data_type; for (int d : TestSizes()) { const T a = static_cast(3); Tensor x, y; @@ -189,26 +166,26 @@ void BenchAXYNKernel() { T* x_data = x.mutable_data(PlaceType()); T* y_data = y.mutable_data(PlaceType()); RandomVec(d, x_data); - BenchAllImpls, PlaceType>(d, &a, x.data(), y_data, - d); + BenchAllImpls(d, &a, x.data(), y_data, d); // test inplace - BenchAllImpls, PlaceType>(d, &a, x.data(), x_data, - d); + BenchAllImpls(d, &a, x.data(), x_data, d); } } -template -void BenchXRNKernel() { +template +void BenchKernelXRN() { + using T = typename KernelTuple::data_type; for (int d : TestSizes()) { Tensor x; RandomVec(d, x.mutable_data({d}, PlaceType())); T res; - BenchAllImpls, PlaceType>(d, x.data(), &res, d); + BenchAllImpls(d, x.data(), &res, d); } } -template -void BenchXYNKernel() { +template +void BenchKernelXYN() { + using T = typename KernelTuple::data_type; for (int d : TestSizes()) { Tensor x, y; x.Resize({d}); @@ -216,12 +193,13 @@ void BenchXYNKernel() { T* x_data = x.mutable_data(PlaceType()); T* y_data = y.mutable_data(PlaceType()); RandomVec(d, x_data); - BenchAllImpls, PlaceType>(d, x.data(), y_data, d); + BenchAllImpls(d, x.data(), y_data, d); } } -template -void BenchLSTMKernel() { +template +void BenchKernelLSTM() { + using T = typename KernelTuple::data_type; for (bool use_peephole : {true, false}) { for (int d : TestSizes()) { const jit::lstm_attr_t attr(d, jit::kVSigmoid, jit::kVTanh, jit::kVTanh, @@ -252,13 +230,14 @@ void BenchLSTMKernel() { step.wp = wp_data; step.checked = checked_data; } - BenchAllImpls, PlaceType>(attr, &step, &attr); + BenchAllImpls(attr, &step, &attr); } } } -template -void BenchGRUKernel() { +template +void BenchKernelGRU() { + using T = typename KernelTuple::data_type; for (int d : TestSizes()) { const jit::gru_attr_t attr(d, jit::kVSigmoid, jit::kVTanh); auto place = PlaceType(); @@ -275,12 +254,13 @@ void BenchGRUKernel() { step.gates = x_data; step.ht_1 = ht_1_data; step.ht = ht_data; - BenchAllImpls, PlaceType>(attr, &step, &attr); + BenchAllImpls(attr, &step, &attr); } } -template -void BenchSeqPoolKernel() { +template +void BenchKernelSeqPool() { + using T = typename KernelTuple::data_type; std::vector pool_types = { jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt}; for (auto type : pool_types) { @@ -294,15 +274,87 @@ void BenchSeqPoolKernel() { RandomVec(h * w, x.mutable_data(PlaceType()), -2.f, 2.f); const T* x_data = x.data(); T* y_data = y.mutable_data(PlaceType()); - BenchAllImpls, PlaceType>(attr, x_data, - y_data, &attr); + BenchAllImpls(attr, x_data, y_data, &attr); + } + } + } +} + +template +void BenchKernelEmbSeqPool() { + using T = typename KernelTuple::data_type; + std::vector pool_types = {jit::SeqPoolType::kSum}; + int64_t tbl_h = 1e4; + for (int tbl_w : {10, 16, 256}) { + Tensor table; + table.Resize({tbl_h, tbl_w}); + RandomVec(tbl_h * tbl_w, table.mutable_data(PlaceType()), -2.f, 2.f); + const T* table_data = table.data(); + for (auto type : pool_types) { + for (int idx_w : {1, 2, 10, 16}) { + for (int idx_h : {1, 2, 9, 13, 16}) { + int64_t out_w = tbl_w * idx_w; + jit::emb_seq_pool_attr_t attr(tbl_h, tbl_w, idx_h, idx_w, out_w, + type); + Tensor idx, out; + idx.Resize({idx_h, idx_w}); + out.Resize({out_w}); + RandomVec(idx_h * idx_w, + idx.mutable_data(PlaceType()), 0, + tbl_h - 1); + const int64_t* idx_data = idx.data(); + T* o_data = out.mutable_data(PlaceType()); + BenchAllImpls(attr, table_data, idx_data, + o_data, &attr); + } } } } } -template -void BenchMatMulKernel() { +template +void BenchKernelSgd() { + using T = typename KernelTuple::data_type; + const T lr = 0.1; + auto UnDuplicatedRandomVec = [](int n, const int64_t lower, + const int64_t upper) -> std::vector { + PADDLE_ENFORCE_LE(static_cast(upper - lower), n - 1); + PADDLE_ENFORCE_GT(n, 0); + std::vector all, out; + for (int i = 0; i < n; ++i) { + all.push_back(i); + } + std::random_shuffle(all.begin(), all.end()); + out.insert(out.begin(), all.begin(), all.begin() + n); + return out; + }; + for (int param_h : {1, 1000}) { + for (int grad_w : {1, 2, 8, 16, 30, 256}) { + // only benchmark inplace + Tensor param; + param.Resize({param_h, grad_w}); + T* param_data = param.mutable_data(PlaceType()); + RandomVec(param_h * grad_w, param_data, -2.f, 2.f); + for (int rows_size = 1; rows_size <= std::min(param_h, 10); ++rows_size) { + Tensor grad; + grad.Resize({rows_size, grad_w}); + std::vector rows = + UnDuplicatedRandomVec(rows_size, 0, rows_size - 1); + RandomVec(rows_size * grad_w, grad.mutable_data(PlaceType()), + -2.f, 2.f); + const T* grad_data = grad.data(); + const int64_t* rows_data = rows.data(); + jit::sgd_attr_t attr(param_h, grad_w, rows_size, grad_w, rows_size); + BenchAllImpls(attr, &lr, param_data, grad_data, + rows_data, param_data, &attr); + } + } + } +} + +template +void BenchKernelMatMul() { + using T = typename KernelTuple::data_type; for (int m : {1, 2, 3, 4}) { for (int n : TestSizes()) { for (int k : TestSizes()) { @@ -316,15 +368,16 @@ void BenchMatMulKernel() { const T* b_data = b.data(); T* c_data = c.mutable_data(PlaceType()); const jit::matmul_attr_t attr{m, n, k}; - BenchAllImpls, PlaceType>(attr, a_data, b_data, - c_data, &attr); + BenchAllImpls(attr, a_data, b_data, c_data, + &attr); } } } } -template -void BenchSoftmaxKernel() { +template +void BenchKernelSoftmax() { + using T = typename KernelTuple::data_type; for (int bs : {1, 2, 10}) { for (int n : TestSizes()) { Tensor x, y; @@ -333,54 +386,170 @@ void BenchSoftmaxKernel() { RandomVec(bs * n, x.mutable_data(PlaceType()), -2.f, 2.f); const T* x_data = x.data(); T* y_data = y.mutable_data(PlaceType()); - BenchAllImpls, PlaceType>(n, x_data, y_data, n, - bs); + BenchAllImpls(n, x_data, y_data, n, bs); + } + } +} + +template +void BenchKernelLayerNorm() { + using T = typename KernelTuple::data_type; + const T epsilon = 9.99999975e-06; + for (int n : {1, 2, 10}) { + for (int x_dim_0 : {1, 9, 17, 50}) { + int left = n * x_dim_0; + for (int x_dim_1 : TestSizes()) { + int right = x_dim_1; + int sz = left * right; + Tensor x, mean, var, scale, bias, out; + x.Resize({n, x_dim_0, x_dim_1}); + out.Resize({n, x_dim_0, x_dim_1}); + mean.Resize({n, x_dim_0}); + var.Resize({n, x_dim_0}); + scale.Resize({x_dim_1}); + bias.Resize({x_dim_1}); + + RandomVec(sz, x.mutable_data(PlaceType()), -2.f, 2.f); + RandomVec(left, mean.mutable_data(PlaceType()), -2.f, 2.f); + RandomVec(left, var.mutable_data(PlaceType()), -2.f, 2.f); + RandomVec(right, scale.mutable_data(PlaceType()), -2.f, 2.f); + RandomVec(right, bias.mutable_data(PlaceType()), -2.f, 2.f); + + const T* scale_data = scale.data(); + const T* bias_data = bias.data(); + T* x_data = x.data(); + T* mean_data = mean.data(); + T* var_data = var.data(); + T* out_data = out.mutable_data(PlaceType()); + + BenchAllImpls(right, x_data, out_data, + mean_data, var_data, scale_data, + bias_data, left, epsilon, right); + } + } + } +} + +template +void BenchKernelCRFDecoding() { + using T = typename KernelTuple::data_type; + constexpr int state_trans_base_idx = 2; + for (int seq_len : {1, 11, 17, 50}) { + for (int tag_num : TestSizes()) { + int x_sz = seq_len * tag_num; + int w_sz = (tag_num + state_trans_base_idx) * tag_num; + Tensor x, w, alpha, track; + x.Resize({seq_len, tag_num}); + w.Resize({tag_num + state_trans_base_idx, tag_num}); + alpha.Resize({seq_len, tag_num}); + track.Resize({seq_len, tag_num}); + + RandomVec(x_sz, x.mutable_data(PlaceType()), -2.f, 2.f); + RandomVec(w_sz, w.mutable_data(PlaceType()), -2.f, 2.f); + + const T* x_data = x.data(); + const T* w_data = w.data(); + T* alpha_data = alpha.mutable_data(PlaceType()); + int* track_data = track.mutable_data(PlaceType()); + + BenchAllImpls(tag_num, seq_len, x_data, w_data, + alpha_data, track_data, tag_num); + } + } +} + +template +void BenchKernelVBroadcast() { + using T = typename KernelTuple::data_type; + for (int64_t w : {1, 16, 64, 100, 256}) { + Tensor x; + x.Resize({w}); + RandomVec(w, x.mutable_data(PlaceType())); + const T* x_data = x.data(); + for (int h : TestSizes()) { + Tensor y; + y.Resize({h * w}); + T* y_data = y.mutable_data(PlaceType()); + BenchAllImpls(w, x_data, y_data, + static_cast(h), w); } } } -using T = float; +#define BenchKernelVMul BenchKernelXYZN +#define BenchKernelVAdd BenchKernelXYZN +#define BenchKernelVAddRelu BenchKernelXYZN +#define BenchKernelVSub BenchKernelXYZN + +#define BenchKernelVScal BenchKernelAXYN +#define BenchKernelVAddBias BenchKernelAXYN + +#define BenchKernelVRelu BenchKernelXYN +#define BenchKernelVIdentity BenchKernelXYN +#define BenchKernelVSquare BenchKernelXYN +#define BenchKernelVExp BenchKernelXYN +#define BenchKernelVSigmoid BenchKernelXYN +#define BenchKernelVTanh BenchKernelXYN +#define BenchKernelVCopy BenchKernelXYN + +#define BenchKernelHMax BenchKernelXRN +#define BenchKernelHSum BenchKernelXRN + +#define BenchKernelLSTMCtHt BenchKernelLSTM +#define BenchKernelLSTMC1H1 BenchKernelLSTM + +#define BenchKernelGRUH1 BenchKernelGRU +#define BenchKernelGRUHtPart1 BenchKernelGRU +#define BenchKernelGRUHtPart2 BenchKernelGRU + using CPUPlace = paddle::platform::CPUPlace; +#define BENCH_FP32_CPU(name) \ + BENCH_JITKERNEL(name, FP32, CPU) { \ + BenchKernel##name, CPUPlace>(); \ + } + // xyzn -BENCH_FP32_CPU(kVMul) { BenchXYZNKernel(); } -BENCH_FP32_CPU(kVAdd) { BenchXYZNKernel(); } -BENCH_FP32_CPU(kVAddRelu) { BenchXYZNKernel(); } -BENCH_FP32_CPU(kVSub) { BenchXYZNKernel(); } +BENCH_FP32_CPU(VMul); +BENCH_FP32_CPU(VAdd); +BENCH_FP32_CPU(VAddRelu); +BENCH_FP32_CPU(VSub); // axyn -BENCH_FP32_CPU(kVScal) { BenchAXYNKernel(); } -BENCH_FP32_CPU(kVAddBias) { BenchAXYNKernel(); } - -// xrn -BENCH_FP32_CPU(kHSum) { BenchXRNKernel(); } -BENCH_FP32_CPU(kHMax) { BenchXRNKernel(); } +BENCH_FP32_CPU(VScal); +BENCH_FP32_CPU(VAddBias); // xyn -BENCH_FP32_CPU(kVRelu) { BenchXYNKernel(); } -BENCH_FP32_CPU(kVIdentity) { BenchXYNKernel(); } -BENCH_FP32_CPU(kVSquare) { BenchXYNKernel(); } -BENCH_FP32_CPU(kVExp) { BenchXYNKernel(); } -BENCH_FP32_CPU(kVSigmoid) { BenchXYNKernel(); } -BENCH_FP32_CPU(kVTanh) { BenchXYNKernel(); } - -// lstm and peephole -BENCH_FP32_CPU(kLSTMCtHt) { BenchLSTMKernel(); } -BENCH_FP32_CPU(kLSTMC1H1) { BenchLSTMKernel(); } - -// gru functions -BENCH_FP32_CPU(kGRUH1) { BenchGRUKernel(); } -BENCH_FP32_CPU(kGRUHtPart1) { BenchGRUKernel(); } -BENCH_FP32_CPU(kGRUHtPart2) { BenchGRUKernel(); } - -// seq pool function -BENCH_FP32_CPU(kSeqPool) { BenchSeqPoolKernel(); } - -// matmul -BENCH_FP32_CPU(kMatMul) { BenchMatMulKernel(); } - -// softmax -BENCH_FP32_CPU(kSoftmax) { BenchSoftmaxKernel(); } +BENCH_FP32_CPU(VRelu); +BENCH_FP32_CPU(VIdentity); +BENCH_FP32_CPU(VSquare); +BENCH_FP32_CPU(VExp); +BENCH_FP32_CPU(VSigmoid); +BENCH_FP32_CPU(VTanh); +BENCH_FP32_CPU(VCopy); + +// xrn +BENCH_FP32_CPU(HMax); +BENCH_FP32_CPU(HSum); + +// LSTM +BENCH_FP32_CPU(LSTMCtHt); +BENCH_FP32_CPU(LSTMC1H1); + +// GRU +BENCH_FP32_CPU(GRUH1); +BENCH_FP32_CPU(GRUHtPart1); +BENCH_FP32_CPU(GRUHtPart2); + +BENCH_FP32_CPU(LayerNorm); +BENCH_FP32_CPU(CRFDecoding); + +BENCH_FP32_CPU(SeqPool); +BENCH_FP32_CPU(EmbSeqPool); +BENCH_FP32_CPU(MatMul); +BENCH_FP32_CPU(Softmax); +BENCH_FP32_CPU(Sgd); +BENCH_FP32_CPU(VBroadcast); // Benchmark all jit kernels including jitcode, mkl and refer. // To use this tool, run command: ./benchmark [options...] diff --git a/paddle/fluid/operators/jit/gen/CMakeLists.txt b/paddle/fluid/operators/jit/gen/CMakeLists.txt index efc7eb79d36c5cf9fac4ac40db4e2e28cb242e22..99244ea9bd919a018732b75d1ab811e8bf338516 100644 --- a/paddle/fluid/operators/jit/gen/CMakeLists.txt +++ b/paddle/fluid/operators/jit/gen/CMakeLists.txt @@ -31,3 +31,6 @@ USE_JITKERNEL_GEN(kNCHW16CMulNC) USE_JITKERNEL_GEN(kSeqPool) USE_JITKERNEL_GEN(kHMax) USE_JITKERNEL_GEN(kHSum) +USE_JITKERNEL_GEN(kEmbSeqPool) +USE_JITKERNEL_GEN(kSgd) +USE_JITKERNEL_GEN(kVBroadcast) diff --git a/paddle/fluid/operators/jit/gen/act.cc b/paddle/fluid/operators/jit/gen/act.cc index e7a7375879064eb27c94315fe7b93eece7866b92..ad68e792c7a8ec4fb600a5b04153ad45895d761a 100644 --- a/paddle/fluid/operators/jit/gen/act.cc +++ b/paddle/fluid/operators/jit/gen/act.cc @@ -13,6 +13,7 @@ * limitations under the License. */ #include "paddle/fluid/operators/jit/gen/act.h" +#include #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" @@ -81,7 +82,7 @@ void VActJitCode::genCode() { #define DECLARE_ACT_CREATOR(name) \ class name##Creator : public JitCodeCreator { \ public: \ - bool UseMe(const int& attr) const override; \ + bool CanBeUsed(const int& attr) const override; \ size_t CodeSize(const int& d) const override; \ std::unique_ptr CreateJitCode(const int& attr) const override { \ return make_unique(attr, CodeSize(attr)); \ @@ -96,27 +97,27 @@ DECLARE_ACT_CREATOR(VSigmoid); DECLARE_ACT_CREATOR(VTanh); // TODO(TJ): tuning use me -bool VReluCreator::UseMe(const int& d) const { +bool VReluCreator::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx); } -bool VSquareCreator::UseMe(const int& d) const { +bool VSquareCreator::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx); } -bool VIdentityCreator::UseMe(const int& d) const { +bool VIdentityCreator::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx); } -bool VExpCreator::UseMe(const int& d) const { +bool VExpCreator::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx) && d < 32; } -bool VSigmoidCreator::UseMe(const int& d) const { +bool VSigmoidCreator::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx); } -bool VTanhCreator::UseMe(const int& d) const { +bool VTanhCreator::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx); } diff --git a/paddle/fluid/operators/jit/gen/blas.cc b/paddle/fluid/operators/jit/gen/blas.cc index 5da24c359edd2df93333fe0ca8a18cdc7385aadb..c126b9077ae50f528074210bae39227a9fcd3277 100644 --- a/paddle/fluid/operators/jit/gen/blas.cc +++ b/paddle/fluid/operators/jit/gen/blas.cc @@ -13,6 +13,7 @@ * limitations under the License. */ #include "paddle/fluid/operators/jit/gen/blas.h" +#include #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" @@ -142,7 +143,7 @@ void NCHW16CMulNCJitCode::genCode() { class NCHW16CMulNCCreator : public JitCodeCreator { public: - bool UseMe(const int& attr) const override { + bool CanBeUsed(const int& attr) const override { return platform::MayIUse(platform::avx512f); } size_t CodeSize(const int& d) const override { return 256 * 1024; } @@ -154,7 +155,7 @@ class NCHW16CMulNCCreator : public JitCodeCreator { #define DECLARE_BLAS_CREATOR(name) \ class name##Creator : public JitCodeCreator { \ public: \ - bool UseMe(const int& attr) const override { \ + bool CanBeUsed(const int& attr) const override { \ return platform::MayIUse(platform::avx) && attr <= 1024; \ } \ size_t CodeSize(const int& d) const override { \ diff --git a/paddle/fluid/operators/jit/gen/embseqpool.cc b/paddle/fluid/operators/jit/gen/embseqpool.cc new file mode 100644 index 0000000000000000000000000000000000000000..331a4b0d0753b37843c3d112256abfbabe9a4913 --- /dev/null +++ b/paddle/fluid/operators/jit/gen/embseqpool.cc @@ -0,0 +1,150 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/fluid/operators/jit/gen/embseqpool.h" +#include // offsetof +#include +#include +#include "paddle/fluid/operators/jit/gen/act.h" // for exp_float_consts ones +#include "paddle/fluid/operators/jit/registry.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +void EmbSeqPoolJitCode::genCode() { + preCode(); + constexpr int block = YMM_FLOAT_BLOCK; + constexpr int max_num_regs = 8; + const int num_block = tbl_w_ / block; + const int num_groups = num_block / max_num_regs; + const size_t block_size = sizeof(float) * block; + std::vector groups(num_groups, max_num_regs); + int rest_num_regs = num_block % max_num_regs; + if (rest_num_regs > 0) { + groups.push_back(rest_num_regs); + } + + // protect param_dst + mov(reg_ptr_param_dst, param_dst); + mov(reg_idx_width_in_byte, + qword[param_attr + offsetof(emb_seq_pool_attr_t, index_width)]); + mov(reg_idx_height, + qword[param_attr + offsetof(emb_seq_pool_attr_t, index_height)]); + mov(rax, sizeof(int64_t)); + mul(reg_idx_width_in_byte); + mov(reg_idx_width_in_byte, rax); + const size_t tbl_width_in_byte = sizeof(float) * tbl_w_; + int acc_num_regs = 0; + for (int num_regs : groups) { + Label l_next_idx_w, l_next_idx_h, l_save_now; + xor_(reg_idx_w_i_in_byte, reg_idx_w_i_in_byte); + mov(reg_ptr_dst_i, reg_ptr_param_dst); + add(reg_ptr_dst_i, acc_num_regs * block_size); + + L(l_next_idx_w); + { + // h == 0 + mov(reg_ptr_idx_i, param_idx); + add(reg_ptr_idx_i, reg_idx_w_i_in_byte); + mov(reg_idx, qword[reg_ptr_idx_i]); + mov(rax, tbl_width_in_byte); + mul(reg_idx); + mov(reg_ptr_tbl_i, rax); // reg is offset now + add(reg_ptr_tbl_i, param_tbl); // reg is ptr_i now + size_t w_offset = 0; + for (int reg_i = 0; reg_i < num_regs; ++reg_i) { + vmovups(ymm_t(reg_i + num_regs), ptr[reg_ptr_tbl_i + w_offset]); + w_offset += block_size; + } + add(reg_ptr_idx_i, reg_idx_width_in_byte); + + // end condition of idx h + mov(reg_idx_h_end, reg_idx_height); + mov(rax, reg_idx_width_in_byte); + mul(reg_idx_h_end); + mov(reg_idx_h_end, rax); + add(reg_idx_h_end, reg_idx_w_i_in_byte); + add(reg_idx_h_end, param_idx); + + cmp(reg_ptr_idx_i, reg_idx_h_end); + jge(l_save_now, T_NEAR); + L(l_next_idx_h); + { + mov(reg_idx, qword[reg_ptr_idx_i]); + mov(reg_ptr_tbl_i, reg_idx); + mov(rax, tbl_width_in_byte); + mul(reg_idx); + mov(reg_ptr_tbl_i, rax); + add(reg_ptr_tbl_i, param_tbl); + size_t w_offset = 0; + for (int reg_i = 0; reg_i < num_regs; ++reg_i) { + vmovups(ymm_t(reg_i), ptr[reg_ptr_tbl_i + w_offset]); + vaddps(ymm_t(reg_i + num_regs), ymm_t(reg_i + num_regs), + ymm_t(reg_i)); + w_offset += block_size; + } + add(reg_ptr_idx_i, reg_idx_width_in_byte); + cmp(reg_ptr_idx_i, reg_idx_h_end); + jl(l_next_idx_h, T_NEAR); + } // end of idx h + L(l_save_now); + // avg or sqrt here, if needed + w_offset = 0; + for (int reg_i = 0; reg_i < num_regs; ++reg_i) { + vmovups(ptr[reg_ptr_dst_i + w_offset], ymm_t(reg_i + num_regs)); + w_offset += block_size; + } + add(reg_ptr_dst_i, tbl_width_in_byte); + add(reg_idx_w_i_in_byte, sizeof(int64_t)); + cmp(reg_idx_w_i_in_byte, reg_idx_width_in_byte); + jl(l_next_idx_w, T_NEAR); + } // end of idx w + + acc_num_regs += num_regs; + add(param_tbl, num_regs * block_size); // do not use acc_num_regs + } // end of groups + postCode(); +} + +class EmbSeqPoolCreator : public JitCodeCreator { + public: + bool CanBeUsed(const emb_seq_pool_attr_t& attr) const override { + return platform::MayIUse(platform::avx) && + attr.table_width % YMM_FLOAT_BLOCK == 0; + } + size_t CodeSize(const emb_seq_pool_attr_t& attr) const override { + return 96 + (attr.table_width / YMM_FLOAT_BLOCK) * 96 * 8; + } + std::unique_ptr CreateJitCode( + const emb_seq_pool_attr_t& attr) const override { + PADDLE_ENFORCE_GT(attr.table_height, 0); + PADDLE_ENFORCE_GT(attr.table_width, 0); + PADDLE_ENFORCE_GT(attr.index_height, 0); + PADDLE_ENFORCE_GT(attr.index_width, 0); + PADDLE_ENFORCE_GT(attr.out_width, 0); + return make_unique(attr, CodeSize(attr)); + } +}; + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle + +namespace gen = paddle::operators::jit::gen; + +REGISTER_JITKERNEL_GEN(kEmbSeqPool, gen::EmbSeqPoolCreator); diff --git a/paddle/fluid/operators/jit/gen/embseqpool.h b/paddle/fluid/operators/jit/gen/embseqpool.h new file mode 100644 index 0000000000000000000000000000000000000000..5afcfbdc1786bef160864fcde06f8738207751be --- /dev/null +++ b/paddle/fluid/operators/jit/gen/embseqpool.h @@ -0,0 +1,81 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include +#include "glog/logging.h" +#include "paddle/fluid/operators/jit/gen/jitcode.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +class EmbSeqPoolJitCode : public JitCode { + public: + explicit EmbSeqPoolJitCode(const emb_seq_pool_attr_t& attr, + size_t code_size = 256 * 1024, + void* code_ptr = nullptr) + : JitCode(code_size, code_ptr), + tbl_w_(attr.table_width), + type_(attr.pool_type) { + if (type_ != SeqPoolType::kSum) { + LOG(FATAL) << "Only support sum pool yet "; + } + this->genCode(); + } + + std::string name() const override { + std::string base = "EmbSeqPoolJitCode"; + if (type_ == SeqPoolType::kSum) { + base += "_Sum"; + } else if (type_ == SeqPoolType::kAvg) { + base += "_Avg"; + } else if (type_ == SeqPoolType::kSqrt) { + base += "_Sqrt"; + } + base += ("_W" + std::to_string(tbl_w_)); + return base; + } + void genCode() override; + + private: + int tbl_w_; + SeqPoolType type_; + reg64_t param_tbl{abi_param1}; + reg64_t param_idx{abi_param2}; + reg64_t param_dst{abi_param3}; + reg64_t param_attr{abi_param4}; + + reg64_t reg_tmp{rax}; + + reg64_t reg_idx_width_in_byte{r8}; + reg64_t reg_idx_height{r9}; + + reg64_t reg_ptr_tbl_i{r10}; + reg64_t reg_idx{r10}; // could use same of reg_ptr_tbl_i + reg64_t reg_ptr_idx_i{r11}; + reg64_t reg_ptr_dst_i{r12}; + reg64_t reg_ptr_param_dst{r13}; // rdx is used in mul so protect param_dst + + reg64_t reg_idx_w_i_in_byte{r14}; + reg64_t reg_idx_h_end{r15}; +}; + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jit/gen/gru.cc b/paddle/fluid/operators/jit/gen/gru.cc index 13f7a14111a80632a06c7fc632da47c0802828f7..b5b0cffa80612c61829766027013f172962b5069 100644 --- a/paddle/fluid/operators/jit/gen/gru.cc +++ b/paddle/fluid/operators/jit/gen/gru.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/operators/jit/gen/gru.h" #include // offsetof +#include #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" @@ -86,7 +87,7 @@ void GRUJitCode::genCode() { class name##Creator : public JitCodeCreator { \ public: \ /* TODO(TJ): enable more */ \ - bool UseMe(const gru_attr_t& attr) const override { \ + bool CanBeUsed(const gru_attr_t& attr) const override { \ return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \ } \ size_t CodeSize(const gru_attr_t& attr) const override { \ diff --git a/paddle/fluid/operators/jit/gen/hopv.cc b/paddle/fluid/operators/jit/gen/hopv.cc index e7884017198623d996fe98a55691da6e342d656a..462ac68a932e14b1200d503a937a35454c0e0618 100644 --- a/paddle/fluid/operators/jit/gen/hopv.cc +++ b/paddle/fluid/operators/jit/gen/hopv.cc @@ -13,6 +13,7 @@ * limitations under the License. */ #include "paddle/fluid/operators/jit/gen/hopv.h" +#include #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" @@ -76,7 +77,7 @@ void HOPVJitCode::genCode() { #define DECLARE_HOP_CREATOR(name) \ class name##Creator : public JitCodeCreator { \ public: \ - bool UseMe(const int& attr) const override { \ + bool CanBeUsed(const int& attr) const override { \ return platform::MayIUse(platform::avx); \ } \ size_t CodeSize(const int& d) const override { \ diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h index 689df8b1cbb7a928c9f9175d28a8231b56e2e82e..228db7cc721099750da30adeaa828ae31f521422 100644 --- a/paddle/fluid/operators/jit/gen/jitcode.h +++ b/paddle/fluid/operators/jit/gen/jitcode.h @@ -31,7 +31,8 @@ namespace gen { // Application Binary Interface constexpr Xbyak::Operand::Code abi_param1(Xbyak::Operand::RDI), abi_param2(Xbyak::Operand::RSI), abi_param3(Xbyak::Operand::RDX), - abi_param4(Xbyak::Operand::RCX); + abi_param4(Xbyak::Operand::RCX), abi_param5(Xbyak::Operand::R8), + abi_param6(Xbyak::Operand::R9); constexpr Xbyak::Operand::Code g_abi_regs[] = { Xbyak::Operand::RBX, Xbyak::Operand::RBP, Xbyak::Operand::R12, @@ -72,7 +73,7 @@ class JitCode : public GenBase, public Xbyak::CodeGenerator { virtual void genCode() = 0; size_t getSize() const override { return CodeGenerator::getSize(); } - const unsigned char* getCodeInternal() override { + const unsigned char* getCodeInternal() const override { const Xbyak::uint8* code = CodeGenerator::getCode(); return code; } diff --git a/paddle/fluid/operators/jit/gen/lstm.cc b/paddle/fluid/operators/jit/gen/lstm.cc index 08bafb5a81882072129a4bfa86d5aff2d33a79a1..2c3bc985e9a8b224835d848d30e0a3ef641ed2f9 100644 --- a/paddle/fluid/operators/jit/gen/lstm.cc +++ b/paddle/fluid/operators/jit/gen/lstm.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/operators/jit/gen/lstm.h" #include // offsetof +#include #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" @@ -114,7 +115,7 @@ void LSTMJitCode::genCode() { class name##Creator : public JitCodeCreator { \ public: \ /* TODO(TJ): enable more */ \ - bool UseMe(const lstm_attr_t& attr) const override { \ + bool CanBeUsed(const lstm_attr_t& attr) const override { \ return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \ } \ size_t CodeSize(const lstm_attr_t& attr) const override { \ diff --git a/paddle/fluid/operators/jit/gen/matmul.cc b/paddle/fluid/operators/jit/gen/matmul.cc index ae3858eab20aeb80553d8fcec4088a6632c9c17d..d9955c8cc655f86bbc6c8135bdfa6c83493727f2 100644 --- a/paddle/fluid/operators/jit/gen/matmul.cc +++ b/paddle/fluid/operators/jit/gen/matmul.cc @@ -14,8 +14,8 @@ #include "paddle/fluid/operators/jit/gen/matmul.h" #include // offsetof +#include #include - #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" @@ -98,7 +98,7 @@ void MatMulJitCode::genCode() { class MatMulCreator : public JitCodeCreator { public: - bool UseMe(const matmul_attr_t& attr) const override { + bool CanBeUsed(const matmul_attr_t& attr) const override { return attr.m == 1 && platform::MayIUse(platform::avx512f) && attr.n % ZMM_FLOAT_BLOCK == 0 && attr.k < 512; } diff --git a/paddle/fluid/operators/jit/gen/seqpool.cc b/paddle/fluid/operators/jit/gen/seqpool.cc index 530d24ee1fb7d9da84102641e1d4d2ab08ab1860..d9e5904add4486ddf126093865f7e0571c1909e4 100644 --- a/paddle/fluid/operators/jit/gen/seqpool.cc +++ b/paddle/fluid/operators/jit/gen/seqpool.cc @@ -13,6 +13,7 @@ * limitations under the License. */ #include "paddle/fluid/operators/jit/gen/seqpool.h" +#include #include "paddle/fluid/operators/jit/gen/act.h" // for exp_float_consts ones #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" @@ -57,7 +58,7 @@ void SeqPoolJitCode::genCode() { class SeqPoolCreator : public JitCodeCreator { public: - bool UseMe(const seq_pool_attr_t& attr) const override { + bool CanBeUsed(const seq_pool_attr_t& attr) const override { return platform::MayIUse(platform::avx); } size_t CodeSize(const seq_pool_attr_t& attr) const override { diff --git a/paddle/fluid/operators/jit/gen/seqpool.h b/paddle/fluid/operators/jit/gen/seqpool.h index 4108ee2f46433f6dc846cbdd3a8f8f9b15cc0c67..e909bc7c7939ee5cb7a2d367c7a452b96e6a91c2 100644 --- a/paddle/fluid/operators/jit/gen/seqpool.h +++ b/paddle/fluid/operators/jit/gen/seqpool.h @@ -32,7 +32,7 @@ class SeqPoolJitCode : public JitCode { : JitCode(code_size, code_ptr), w_(attr.w), type_(attr.type) { if (!(type_ == SeqPoolType::kSum || type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt)) { - LOG(FATAL) << "Only support sum pool yet "; + LOG(FATAL) << "Only supported pool type: sum, avg and sqrt."; } fp_h_[0] = 1.f; this->genCode(); diff --git a/paddle/fluid/operators/jit/gen/sgd.cc b/paddle/fluid/operators/jit/gen/sgd.cc new file mode 100644 index 0000000000000000000000000000000000000000..e65d3500b496c811b2da39752417ce5ef3ab3914 --- /dev/null +++ b/paddle/fluid/operators/jit/gen/sgd.cc @@ -0,0 +1,131 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/fluid/operators/jit/gen/sgd.h" +#include // offsetof +#include +#include +#include "paddle/fluid/operators/jit/registry.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +void SgdJitCode::genCode() { + preCode(); + constexpr int block = YMM_FLOAT_BLOCK; + constexpr int max_num_regs = 7; + const int num_block = w_ / block; + const int num_groups = num_block / max_num_regs; + const size_t block_size = sizeof(float) * block; + const size_t width_size = w_ * sizeof(float); + std::vector groups(num_groups, max_num_regs); + int rest_num_regs = num_block % max_num_regs; + if (rest_num_regs > 0) { + groups.push_back(rest_num_regs); + } + + vbroadcastss(ymm_lr, ptr[param_lr]); + // protect rdx + mov(reg_ptr_grad_i, param_grad); + mov(reg_ptr_rows_i, param_rows); + + mov(reg_rows_size_in_byte, + qword[param_attr + offsetof(sgd_attr_t, selected_rows_size)]); + mov(rax, sizeof(int64_t)); + mul(reg_rows_size_in_byte); + mov(reg_rows_size_in_byte, rax); + add(reg_rows_size_in_byte, reg_ptr_rows_i); + + Label l_next_row; + L(l_next_row); + { + mov(reg_row, qword[reg_ptr_rows_i]); + mov(rax, width_size); + mul(reg_row); + mov(reg_row, rax); + + mov(reg_ptr_param_i, param_param); + mov(reg_ptr_out_i, param_out); + add(reg_ptr_param_i, reg_row); + add(reg_ptr_out_i, reg_row); + + size_t w_offset = 0; + for (int num_regs : groups) { + // load grad + size_t inner_offfset = w_offset; + for (int reg_i = 0; reg_i < num_regs; ++reg_i) { + vmovups(ymm_t(reg_i), ptr[reg_ptr_grad_i + inner_offfset]); + inner_offfset += block_size; + } + + // load param + inner_offfset = w_offset; + for (int reg_i = 0; reg_i < num_regs; ++reg_i) { + vmovups(ymm_t(reg_i + num_regs), ptr[reg_ptr_param_i + inner_offfset]); + inner_offfset += block_size; + } + + // compute out + for (int reg_i = 0; reg_i < num_regs; ++reg_i) { + vmulps(ymm_t(reg_i), ymm_t(reg_i), ymm_lr); + vsubps(ymm_t(reg_i + num_regs), ymm_t(reg_i + num_regs), ymm_t(reg_i)); + } + + // save out + inner_offfset = w_offset; + for (int reg_i = 0; reg_i < num_regs; ++reg_i) { + vmovups(ptr[reg_ptr_out_i + inner_offfset], ymm_t(reg_i + num_regs)); + inner_offfset += block_size; + } + w_offset += (block_size * num_regs); + } + + add(reg_ptr_grad_i, width_size); + add(reg_ptr_rows_i, sizeof(int64_t)); + cmp(reg_ptr_rows_i, reg_rows_size_in_byte); + jl(l_next_row, T_NEAR); + } + + postCode(); +} + +class SgdCreator : public JitCodeCreator { + public: + bool CanBeUsed(const sgd_attr_t& attr) const override { + return platform::MayIUse(platform::avx) && + attr.grad_width % YMM_FLOAT_BLOCK == 0; + } + size_t CodeSize(const sgd_attr_t& attr) const override { + return 96 + (attr.grad_width / YMM_FLOAT_BLOCK) * 32 * 8; + } + std::unique_ptr CreateJitCode( + const sgd_attr_t& attr) const override { + PADDLE_ENFORCE_EQ(attr.param_width, attr.grad_width); + PADDLE_ENFORCE_LE(attr.selected_rows_size, attr.grad_height); + PADDLE_ENFORCE_GE(attr.selected_rows_size, 0); + return make_unique(attr, CodeSize(attr)); + } +}; + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle + +namespace gen = paddle::operators::jit::gen; + +REGISTER_JITKERNEL_GEN(kSgd, gen::SgdCreator); diff --git a/paddle/fluid/operators/jit/gen/sgd.h b/paddle/fluid/operators/jit/gen/sgd.h new file mode 100644 index 0000000000000000000000000000000000000000..317edcd2bcb5fea1f14f32260fd16c9c706eaf00 --- /dev/null +++ b/paddle/fluid/operators/jit/gen/sgd.h @@ -0,0 +1,60 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include +#include "glog/logging.h" +#include "paddle/fluid/operators/jit/gen/jitcode.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +class SgdJitCode : public JitCode { + public: + explicit SgdJitCode(const sgd_attr_t& attr, size_t code_size = 256 * 1024, + void* code_ptr = nullptr) + : JitCode(code_size, code_ptr), w_(attr.grad_width) { + this->genCode(); + } + + DECLARE_JIT_CODE(SgdJitCode); + void genCode() override; + + private: + int w_; + reg64_t param_lr{abi_param1}; + reg64_t param_param{abi_param2}; + reg64_t param_grad{abi_param3}; + reg64_t param_rows{abi_param4}; + reg64_t param_out{abi_param5}; + reg64_t param_attr{abi_param6}; + + ymm_t ymm_lr = ymm_t(15); + + reg64_t reg_ptr_grad_i{r10}; + reg64_t reg_ptr_rows_i{r11}; + reg64_t reg_rows_size_in_byte{r12}; + reg64_t reg_row{r13}; + reg64_t reg_ptr_param_i{r14}; + reg64_t reg_ptr_out_i{r15}; +}; + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jit/gen/vbroadcast.cc b/paddle/fluid/operators/jit/gen/vbroadcast.cc new file mode 100644 index 0000000000000000000000000000000000000000..66a8d75fd4de5bae3ba37cf7fe7b1645938aa855 --- /dev/null +++ b/paddle/fluid/operators/jit/gen/vbroadcast.cc @@ -0,0 +1,91 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/fluid/operators/jit/gen/vbroadcast.h" +#include +#include +#include "paddle/fluid/operators/jit/registry.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +void VBroadcastJitCode::genCode() { + preCode(); + constexpr int block = YMM_FLOAT_BLOCK; + constexpr int max_num_regs = 16; + const int num_block = w_ / block; + const int num_groups = num_block / max_num_regs; + const size_t block_size = sizeof(float) * block; + std::vector groups(num_groups, max_num_regs); + int rest_num_regs = num_block % max_num_regs; + if (rest_num_regs > 0) { + groups.push_back(rest_num_regs); + } + + // protect param_h + mov(reg_height, param_h); + Label l_next_h; + xor_(reg_h_i, reg_h_i); + mov(reg_ptr_dst_i, param_dst); + L(l_next_h); + { + mov(reg_ptr_src_i, param_src); + for (int num_regs : groups) { + size_t w_offset = 0; + for (int reg_i = 0; reg_i < num_regs; ++reg_i) { + vmovups(ymm_t(reg_i), ptr[reg_ptr_src_i + w_offset]); + w_offset += block_size; + } + add(reg_ptr_src_i, num_regs * block_size); + + w_offset = 0; + for (int reg_i = 0; reg_i < num_regs; ++reg_i) { + vmovups(ptr[reg_ptr_dst_i + w_offset], ymm_t(reg_i)); + w_offset += block_size; + } + add(reg_ptr_dst_i, num_regs * block_size); + } // end of groups + inc(reg_h_i); + cmp(reg_h_i, reg_height); + jl(l_next_h, T_NEAR); + } // end of l_next_h + + postCode(); +} + +class VBroadcastCreator : public JitCodeCreator { + public: + bool CanBeUsed(const int64_t& w) const override { + return platform::MayIUse(platform::avx) && w % YMM_FLOAT_BLOCK == 0; + } + size_t CodeSize(const int64_t& w) const override { + return 96 + (w / YMM_FLOAT_BLOCK) * 16 * 8; + } + std::unique_ptr CreateJitCode(const int64_t& w) const override { + PADDLE_ENFORCE_GT(w, 0); + return make_unique(w, CodeSize(w)); + } +}; + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle + +namespace gen = paddle::operators::jit::gen; + +REGISTER_JITKERNEL_GEN(kVBroadcast, gen::VBroadcastCreator); diff --git a/paddle/fluid/operators/jit/gen/vbroadcast.h b/paddle/fluid/operators/jit/gen/vbroadcast.h new file mode 100644 index 0000000000000000000000000000000000000000..27c75f6f710e9514c7d91181e7f447d9dd997081 --- /dev/null +++ b/paddle/fluid/operators/jit/gen/vbroadcast.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include +#include "glog/logging.h" +#include "paddle/fluid/operators/jit/gen/jitcode.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +class VBroadcastJitCode : public JitCode { + public: + explicit VBroadcastJitCode(const int64_t& w, size_t code_size = 256 * 1024, + void* code_ptr = nullptr) + : JitCode(code_size, code_ptr), w_(w) { + this->genCode(); + } + + DECLARE_JIT_CODE(VBroadcastJitCode); + void genCode() override; + + private: + int w_; + reg64_t param_src{abi_param1}; + reg64_t param_dst{abi_param2}; + reg64_t param_h{abi_param3}; + reg64_t param_w{abi_param4}; + + reg64_t reg_height{r9}; + reg64_t reg_h_i{r10}; + reg64_t reg_ptr_src_i{r11}; + reg64_t reg_ptr_dst_i{r12}; +}; + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jit/gen_base.cc b/paddle/fluid/operators/jit/gen_base.cc index f3603875ad7bda1fc688f9c053e0d37f7bb31f02..4c49eff49e3efc0664a084f9fa2bb897db0c6f1d 100644 --- a/paddle/fluid/operators/jit/gen_base.cc +++ b/paddle/fluid/operators/jit/gen_base.cc @@ -31,7 +31,7 @@ namespace paddle { namespace operators { namespace jit { -// refer do not need useme, it would be the last one. +// refer do not need CanBeUsed, it would be the last one. void GenBase::dumpCode(const unsigned char* code) const { if (code) { static int counter = 0; diff --git a/paddle/fluid/operators/jit/gen_base.h b/paddle/fluid/operators/jit/gen_base.h index a7c7a35a7ea35bd80333b04f001d4ab5b5d1e06b..033c603c07c288ba621ceaa912ea0c476fe86cd6 100644 --- a/paddle/fluid/operators/jit/gen_base.h +++ b/paddle/fluid/operators/jit/gen_base.h @@ -31,9 +31,10 @@ class GenBase : public Kernel { virtual ~GenBase() = default; virtual std::string name() const = 0; virtual size_t getSize() const = 0; - virtual const unsigned char* getCodeInternal() = 0; + virtual const unsigned char* getCodeInternal() const = 0; + const char* ImplType() const override { return "JitCode"; } template - Func getCode() { + Func getCode() const { const unsigned char* code = this->getCodeInternal(); if (FLAGS_dump_jitcode) { this->dumpCode(code); @@ -65,7 +66,7 @@ class JitCodeCreator : public GenCreator { virtual ~JitCodeCreator() = default; // condition when this jit code can be used. - virtual bool UseMe(const Attr& attr) const = 0; + virtual bool CanBeUsed(const Attr& attr) const = 0; // estimate this code size virtual size_t CodeSize(const Attr& attr) const = 0; diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc index e7292fe2bd8031aa5bbff68e7c2305a238085bf1..eb1c410b6f9a31c3f97a274c5e5ff55bf1c32ea0 100644 --- a/paddle/fluid/operators/jit/helper.cc +++ b/paddle/fluid/operators/jit/helper.cc @@ -36,6 +36,8 @@ const char* to_string(KernelType kt) { ONE_CASE(kVScal); ONE_CASE(kVAddBias); ONE_CASE(kVRelu); + ONE_CASE(kVBroadcast); + ONE_CASE(kVCopy); ONE_CASE(kVIdentity); ONE_CASE(kVExp); ONE_CASE(kVSquare); @@ -54,6 +56,8 @@ const char* to_string(KernelType kt) { ONE_CASE(kHMax); ONE_CASE(kHSum); ONE_CASE(kSoftmax); + ONE_CASE(kEmbSeqPool); + ONE_CASE(kSgd); default: PADDLE_THROW("Not support type: %d, or forget to add it.", kt); return "NOT JITKernel"; diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index d5773d65940127ea0a9b77ed2760bd371b778f4c..1ac5318d461c2e8bc4f43569602a88f95a76befb 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -16,6 +16,8 @@ #include #include +#include +#include // for std::move #include #include "paddle/fluid/operators/jit/gen_base.h" #include "paddle/fluid/operators/jit/kernel_base.h" @@ -27,35 +29,34 @@ namespace paddle { namespace operators { namespace jit { -template +template inline typename std::enable_if< - std::is_same::value && + std::is_same::value && std::is_same::value, - typename KernelTuples::func_type>::type -GetJitCode(const typename KernelTuples::attr_type& attr) { - using Func = typename KernelTuples::func_type; - using Attr = typename KernelTuples::attr_type; - size_t key = JitCodeKey(attr); - auto& codes = JitCodePool().Instance(); + const Kernel*>::type +GetJitCode(const typename KernelTuple::attr_type& attr) { + using Attr = typename KernelTuple::attr_type; + int64_t key = JitCodeKey(attr); + auto& codes = JitCodePool::Instance(); if (codes.Has(key)) { - return codes.AllKernels().at(key)->template getCode(); + return codes.AllKernels().at(key).get(); } // creator is not related with attr, so can use KernelKey as key - KernelKey kkey(KT, PlaceType()); + KernelKey kkey(KernelTuple::kernel_type, PlaceType()); // pool: (KernelKey(type, place), vector) - auto& creator_map = JitCodeCreatorPool().Instance().AllCreators(); + auto& creator_map = JitCodeCreatorPool::Instance().AllCreators(); auto iter = creator_map.find(kkey); if (iter != creator_map.end()) { auto& creators = iter->second; for (auto& cur : creators) { auto i = dynamic_cast*>(cur.get()); - if (i && i->UseMe(attr)) { + if (i && i->CanBeUsed(attr)) { auto p = i->CreateJitCode(attr); if (p) { - auto f = p->template getCode(); + auto res = p.get(); codes.Insert(key, std::move(p)); - return f; + return res; } } } @@ -63,87 +64,153 @@ GetJitCode(const typename KernelTuples::attr_type& attr) { return nullptr; } -template +template inline typename std::enable_if< - !std::is_same::value || + !std::is_same::value || !std::is_same::value, - typename KernelTuples::func_type>::type -GetJitCode(const typename KernelTuples::attr_type& attr) { + const Kernel*>::type +GetJitCode(const typename KernelTuple::attr_type& attr) { return nullptr; } // Refer code do not related with attr, which is just for cast // Refer is always on CPUPlace -template -inline typename KernelTuples::func_type GetRefer() { - auto& ref_pool = ReferKernelPool().Instance().AllKernels(); - KernelKey kkey(KT, platform::CPUPlace()); +template +inline const Kernel* GetReferKernel() { + auto& ref_pool = ReferKernelPool::Instance().AllKernels(); + KernelKey kkey(KernelTuple::kernel_type, platform::CPUPlace()); auto ref_iter = ref_pool.find(kkey); PADDLE_ENFORCE(ref_iter != ref_pool.end(), "Every Kernel should have reference function."); auto& ref_impls = ref_iter->second; for (auto& impl : ref_impls) { - auto i = dynamic_cast*>(impl.get()); + auto i = dynamic_cast*>(impl.get()); if (i) { - return i->GetFunc(); + return i; } } return nullptr; } -template -typename KernelTuples::func_type Get( - const typename KernelTuples::attr_type& attr) { - auto jitfunc = GetJitCode(attr); - if (jitfunc) { - return jitfunc; +template +inline typename KernelTuple::func_type GetReferFunc() { + auto ker = GetReferKernel(); + auto p = dynamic_cast*>(ker); + PADDLE_ENFORCE(p, "The Refer kernel should exsit"); + return p->GetFunc(); +} + +// Return all Kernels that can be used +template +std::vector GetAllCandidateKernels( + const typename KernelTuple::attr_type& attr) { + // the search order shoudl be jitcode > more > refer + std::vector res; + auto jitker = GetJitCode(attr); + if (jitker) { + res.emplace_back(jitker); } - // pool: (KernelKey(type, place), vector) - KernelKey kkey(KT, PlaceType()); - auto& pool = KernelPool().Instance().AllKernels(); + // more kernelpool: (KernelKey(type, place), vector) + KernelKey kkey(KernelTuple::kernel_type, PlaceType()); + auto& pool = KernelPool::Instance().AllKernels(); auto iter = pool.find(kkey); if (iter != pool.end()) { auto& impls = iter->second; for (auto& impl : impls) { - auto i = dynamic_cast*>(impl.get()); - if (i && i->UseMe(attr)) { - return i->GetFunc(); + auto i = dynamic_cast*>(impl.get()); + if (i && i->CanBeUsed(attr)) { + res.emplace_back(i); } } } // The last implementation should be reference function on CPUPlace. - return GetRefer(); + auto ref = GetReferKernel(); + PADDLE_ENFORCE(ref != nullptr, "Refer Kernel can not be empty."); + res.emplace_back(ref); + return res; } -template +template +std::vector> +GetAllCandidateFuncsWithTypes(const typename KernelTuple::attr_type& attr) { + using Func = typename KernelTuple::func_type; + auto kers = GetAllCandidateKernels(attr); + std::vector> res; + for (auto k : kers) { + std::string name = k->ImplType(); + if (name == "JitCode") { + auto i = dynamic_cast(k); + PADDLE_ENFORCE(i, "jitcode kernel cast can not fail."); + res.emplace_back(std::make_pair(name, i->template getCode())); + } else { + auto i = dynamic_cast*>(k); + PADDLE_ENFORCE(i, "kernel cast can not fail."); + res.emplace_back(std::make_pair(name, i->GetFunc())); + } + } + return res; +} + +template +std::vector GetAllCandidateFuncs( + const typename KernelTuple::attr_type& attr) { + auto funcs = GetAllCandidateFuncsWithTypes(attr); + std::vector res; + for (auto& i : funcs) { + res.emplace_back(i.second); + } + return res; +} + +template +typename KernelTuple::func_type GetDefaultBestFunc( + const typename KernelTuple::attr_type& attr) { + auto funcs = GetAllCandidateFuncs(attr); + PADDLE_ENFORCE_GE(funcs.size(), 1UL); + // Here could do some runtime benchmark of this attr and return the best one. + // But yet just get the first one as the default best one, + // which is searched in order and tuned by offline. + return funcs[0]; +} + +template class KernelFuncs { public: KernelFuncs() = default; static KernelFuncs& Cache() { - static thread_local KernelFuncs g_func_cache; + static thread_local KernelFuncs g_func_cache; return g_func_cache; } - bool Has(int key) const { return funcs_.find(key) != funcs_.end(); } - - void Insert(int key, typename KernelTuples::func_type func) { - funcs_.emplace(key, func); - } - - typename KernelTuples::func_type At(int key) { + // the exposed interface to use + typename KernelTuple::func_type At( + const typename KernelTuple::attr_type& attr) { + // Maybe here is not good enough, not all kernels should have jitcode + int64_t key = JitCodeKey(attr); if (Has(key)) { return funcs_.at(key); } - auto func = Get(key); + // If do not have this attr in cache then get the default best + auto func = GetDefaultBestFunc(attr); Insert(key, func); return func; } + typename KernelTuple::func_type operator[]( + const typename KernelTuple::attr_type& attr) { + return At(attr); + } + + protected: + bool Has(int64_t key) const { return funcs_.find(key) != funcs_.end(); } + void Insert(int64_t key, typename KernelTuple::func_type func) { + funcs_.emplace(key, func); + } + private: - std::unordered_map funcs_; + std::unordered_map funcs_; DISABLE_COPY_AND_ASSIGN(KernelFuncs); }; @@ -172,6 +239,23 @@ inline std::ostream& operator<<(std::ostream& os, const seq_pool_attr_t& attr) { return os; } +inline std::ostream& operator<<(std::ostream& os, + const emb_seq_pool_attr_t& attr) { + os << "table_height[" << attr.table_height << "],table_width[" + << attr.table_width << "],index_height[" << attr.index_height + << "],index_width[" << attr.index_width << "],output_width[" + << attr.out_width << "],pool_type[" << to_string(attr.pool_type) << "]"; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const sgd_attr_t& attr) { + os << "param_height[" << attr.param_height << "],param_width[" + << attr.param_width << "],grad_height[" << attr.grad_height + << "],grad_width[" << attr.grad_width << "],selected_rows_size[" + << attr.selected_rows_size << "]"; + return os; +} + inline std::ostream& operator<<(std::ostream& os, const matmul_attr_t& attr) { os << "M[" << attr.m << "],N[" << attr.n << "],K[" << attr.k << "]"; return os; diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index 4a8f61146a1921fa1d5f6b7e15af40cd45d31a22..bd34d7dfc72a139e70983c56c3220bd01d572bcd 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -13,6 +13,7 @@ * limitations under the License. */ #pragma once +#include #include "paddle/fluid/operators/jit/macro.h" #include "paddle/fluid/platform/macros.h" @@ -20,34 +21,38 @@ namespace paddle { namespace operators { namespace jit { -// TODO(TJ): reorder by alphabet typedef enum { kNone = 0, - kVMul = 1, - kVAdd = 2, - kVAddRelu, - kVSub, - kVScal, - kVAddBias, - kVRelu, - kVIdentity, - kVSquare, - kVExp, - kVSigmoid, - kVTanh, - kLSTMCtHt, - kLSTMC1H1, + // sort by alphabet + kCRFDecoding = 1, + kEmbSeqPool = 2, kGRUH1, kGRUHtPart1, kGRUHtPart2, - kCRFDecoding, + kHSum, // horizontal max + kHMax, // horizontal sum + kLSTMCtHt, + kLSTMC1H1, kLayerNorm, + kMatMul, kNCHW16CMulNC, kSeqPool, - kMatMul, - kHSum, // horizontal max - kHMax, // horizontal sum kSoftmax, + kVAdd, + kVAddBias, + kVAddRelu, + kVBroadcast, + kVCopy, + kVExp, + kVIdentity, + kVMul, + kVRelu, + kVScal, + kSgd, + kVSigmoid, + kVSquare, + kVSub, + kVTanh, } KernelType; typedef enum { @@ -57,26 +62,55 @@ typedef enum { kSqrt, } SeqPoolType; +// x, y, z, n template -struct XYZNTuples { +struct XYZNTuple { typedef T data_type; typedef int attr_type; typedef void (*func_type)(const T*, const T*, T*, int); }; +// a, x, y, n template -struct AXYNTuples : public XYZNTuples {}; +struct AXYNTuple : public XYZNTuple {}; +// x, y, n template -struct XYNTuples { +struct XYNTuple { typedef T data_type; typedef int attr_type; typedef void (*func_type)(const T*, T*, int); }; -// x, return and int +// x, returned value, n template -struct XRNTuples : public XYNTuples {}; +struct XRNTuple : public XYNTuple {}; + +#define DECLARE_KERNELTUPLE(kernel_tuple, type) \ + template \ + struct type##Tuple : public kernel_tuple { \ + static constexpr KernelType kernel_type = k##type; \ + } + +// Tuple should be corresponding to the KernelType +DECLARE_KERNELTUPLE(XYZNTuple, VMul); +DECLARE_KERNELTUPLE(XYZNTuple, VAdd); +DECLARE_KERNELTUPLE(XYZNTuple, VAddRelu); +DECLARE_KERNELTUPLE(XYZNTuple, VSub); + +DECLARE_KERNELTUPLE(AXYNTuple, VScal); +DECLARE_KERNELTUPLE(AXYNTuple, VAddBias); + +DECLARE_KERNELTUPLE(XYNTuple, VRelu); +DECLARE_KERNELTUPLE(XYNTuple, VIdentity); +DECLARE_KERNELTUPLE(XYNTuple, VSquare); +DECLARE_KERNELTUPLE(XYNTuple, VExp); +DECLARE_KERNELTUPLE(XYNTuple, VSigmoid); +DECLARE_KERNELTUPLE(XYNTuple, VTanh); +DECLARE_KERNELTUPLE(XYNTuple, VCopy); + +DECLARE_KERNELTUPLE(XRNTuple, HMax); +DECLARE_KERNELTUPLE(XRNTuple, HSum); typedef struct { void* gates; // gates: x_ch, x_ih, x_fh, x_oh @@ -117,19 +151,36 @@ typedef struct rnn_attr_s gru_attr_t; typedef struct lstm_attr_s lstm_attr_t; template -struct LSTMTuples { +struct LSTMTuple { typedef T data_type; typedef lstm_attr_t attr_type; typedef void (*func_type)(lstm_t*, const lstm_attr_t*); }; template -struct GRUTuples { +struct GRUTuple { typedef T data_type; typedef gru_attr_t attr_type; typedef void (*func_type)(gru_t*, const gru_attr_t*); }; +DECLARE_KERNELTUPLE(LSTMTuple, LSTMCtHt); +DECLARE_KERNELTUPLE(LSTMTuple, LSTMC1H1); + +DECLARE_KERNELTUPLE(GRUTuple, GRUH1); +DECLARE_KERNELTUPLE(GRUTuple, GRUHtPart1); +DECLARE_KERNELTUPLE(GRUTuple, GRUHtPart2); + +#undef DECLARE_KERNELTUPLE + +template +struct VBroadcastTuple { + static constexpr KernelType kernel_type = kVBroadcast; + typedef T data_type; + typedef int64_t attr_type; + typedef void (*func_type)(const T*, T*, int64_t, int64_t); +}; + typedef struct seq_pool_attr_s { int h, w; // h should always be the first one SeqPoolType type; @@ -139,12 +190,63 @@ typedef struct seq_pool_attr_s { } seq_pool_attr_t; template -struct SeqPoolTuples { +struct SeqPoolTuple { + static constexpr KernelType kernel_type = kSeqPool; typedef T data_type; typedef seq_pool_attr_t attr_type; typedef void (*func_type)(const T*, T*, const seq_pool_attr_t*); }; +typedef struct emb_seq_pool_attr_s { + int64_t table_height, table_width; + int64_t index_height, index_width; + int64_t out_width; + SeqPoolType pool_type; + emb_seq_pool_attr_s() = default; + explicit emb_seq_pool_attr_s(int64_t tbl_height, int64_t tbl_width, + int64_t idx_height, int64_t idx_width, + int64_t output_width, + SeqPoolType seqpool_type = SeqPoolType::kSum) + : table_height(tbl_height), + table_width(tbl_width), + index_height(idx_height), + index_width(idx_width), + out_width(output_width), + pool_type(seqpool_type) {} +} emb_seq_pool_attr_t; + +template +struct EmbSeqPoolTuple { + static constexpr KernelType kernel_type = kEmbSeqPool; + typedef T data_type; + typedef emb_seq_pool_attr_t attr_type; + typedef void (*func_type)(const T*, const int64_t*, T*, + const emb_seq_pool_attr_t*); +}; + +typedef struct sgd_attr_s { + int64_t param_height, param_width; + int64_t grad_height, grad_width; + int64_t selected_rows_size; + sgd_attr_s() = default; + explicit sgd_attr_s(int64_t param_h, int64_t param_w, int64_t grad_h, + int64_t grad_w, int64_t selected_rows_sz) + : param_height(param_h), + param_width(param_w), + grad_height(grad_h), + grad_width(grad_w), + selected_rows_size(selected_rows_sz) {} +} sgd_attr_t; + +template +struct SgdTuple { + static constexpr KernelType kernel_type = kSgd; + typedef T data_type; + typedef sgd_attr_t attr_type; + typedef void (*func_type)(const T*, const T*, const T*, const int64_t*, T*, + const sgd_attr_t*); +}; + typedef struct matmul_attr_s { int m, n, k; void* packed_weight{nullptr}; @@ -154,21 +256,24 @@ typedef struct matmul_attr_s { } matmul_attr_t; template -struct MatMulTuples { +struct MatMulTuple { + static constexpr KernelType kernel_type = kMatMul; typedef T data_type; typedef matmul_attr_t attr_type; typedef void (*func_type)(const T*, const T*, T*, const matmul_attr_t*); }; template -struct CRFDecodingTuples { +struct CRFDecodingTuple { + static constexpr KernelType kernel_type = kCRFDecoding; typedef T data_type; typedef int attr_type; typedef void (*func_type)(const int, const T*, const T*, T*, int*, int); }; template -struct LayerNormTuples { +struct LayerNormTuple { + static constexpr KernelType kernel_type = kLayerNorm; typedef T data_type; typedef int attr_type; typedef void (*func_type)(T*, T*, T*, T*, const T*, const T*, int, @@ -176,7 +281,8 @@ struct LayerNormTuples { }; template -struct SoftmaxTuples { +struct SoftmaxTuple { + static constexpr KernelType kernel_type = kSoftmax; typedef T data_type; typedef int attr_type; typedef void (*func_type)(const T*, T*, int, int); @@ -184,7 +290,8 @@ struct SoftmaxTuples { // nChw16c = nChw16c .* NC template -struct NCHW16CMulNCTuples { +struct NCHW16CMulNCTuple { + static constexpr KernelType kernel_type = kNCHW16CMulNC; typedef T data_type; typedef int attr_type; typedef void (*func_type)(const T*, const T*, T*, int, int); @@ -195,28 +302,29 @@ class Kernel { public: Kernel() = default; virtual ~Kernel() = default; + virtual const char* ImplType() const = 0; DISABLE_COPY_AND_ASSIGN(Kernel); }; -template +template class KernelMore : public Kernel { public: - using T = typename KernelTuples::data_type; - using Func = typename KernelTuples::func_type; - using Attr = typename KernelTuples::attr_type; + using T = typename KernelTuple::data_type; + using Func = typename KernelTuple::func_type; + using Attr = typename KernelTuple::attr_type; virtual Func GetFunc() const { return func; } - virtual bool UseMe(const Attr& attr) const = 0; - virtual const char* ImplType() const = 0; + // specify this kernel can be used, means it should not fail if use it. + virtual bool CanBeUsed(const Attr& attr) const = 0; protected: Func func{nullptr}; }; -template -class ReferKernel : public KernelMore { +template +class ReferKernel : public KernelMore { public: // Refer code can always be used - bool UseMe(const typename KernelTuples::attr_type& attr) const override { + bool CanBeUsed(const typename KernelTuple::attr_type& attr) const override { return true; } const char* ImplType() const override { return "Refer"; } diff --git a/paddle/fluid/operators/jit/kernel_key.cc b/paddle/fluid/operators/jit/kernel_key.cc index 1e4a8884e78c5d3c1748988f05ecf461a6f0eb94..1ad220b3972a3d3920610ab8f7ea416892a80d22 100644 --- a/paddle/fluid/operators/jit/kernel_key.cc +++ b/paddle/fluid/operators/jit/kernel_key.cc @@ -13,47 +13,55 @@ * limitations under the License. */ #include "paddle/fluid/operators/jit/kernel_key.h" +#include // XXH64: 13.8 GB/s +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace operators { namespace jit { template <> -size_t JitCodeKey(const int& d) { +int64_t JitCodeKey(const int& d) { return d; } -constexpr int act_type_shift = 3; // suppot 2^3 act types +template <> +int64_t JitCodeKey(const int64_t& d) { + return d; +} + +template <> +int64_t JitCodeKey(const gru_attr_t& attr) { + return XXH64(&attr, sizeof(gru_attr_t), 0); +} + +template <> +int64_t JitCodeKey(const lstm_attr_t& attr) { + int keys[5] = { + attr.d, static_cast(attr.act_gate), static_cast(attr.act_cand), + static_cast(attr.act_cell), static_cast(attr.use_peephole)}; + return XXH64(keys, sizeof(int) * 5, 0); +} template <> -size_t JitCodeKey(const lstm_attr_t& attr) { - size_t key = attr.d; - int gate_key = static_cast(attr.act_gate) << 1; - int cand_key = static_cast(attr.act_cand) << (1 + act_type_shift); - int cell_key = static_cast(attr.act_cell) << (1 + act_type_shift * 2); - return (key << (1 + act_type_shift * 3)) + gate_key + cand_key + cell_key + - attr.use_peephole; +int64_t JitCodeKey(const seq_pool_attr_t& attr) { + int keys[2] = {attr.w, static_cast(attr.type)}; + return XXH64(keys, sizeof(int) * 2, 0); } template <> -size_t JitCodeKey(const gru_attr_t& attr) { - size_t key = attr.d; - return (key << (act_type_shift * 2)) + static_cast(attr.act_gate) + - (static_cast(attr.act_cand) << act_type_shift); +int64_t JitCodeKey(const matmul_attr_t& attr) { + return XXH64(&attr, sizeof(int) * 3, 0); // m, n, k } template <> -size_t JitCodeKey(const seq_pool_attr_t& attr) { - size_t key = attr.w; - constexpr int pool_type_shift = 3; - return (key << pool_type_shift) + static_cast(attr.type); +int64_t JitCodeKey(const emb_seq_pool_attr_t& attr) { + return attr.table_width; } template <> -size_t JitCodeKey(const matmul_attr_t& attr) { - size_t key = attr.m; - constexpr int shift = 21; - return (key << shift * 2) + ((static_cast(attr.n)) << shift) + attr.k; +int64_t JitCodeKey(const sgd_attr_t& attr) { + return attr.grad_width; } } // namespace jit diff --git a/paddle/fluid/operators/jit/kernel_key.h b/paddle/fluid/operators/jit/kernel_key.h index 611a0210d614196ad0b05d583303688c1d964e04..b2cf92f23e8ccff5fff7c6e193f7118fbb4765f0 100644 --- a/paddle/fluid/operators/jit/kernel_key.h +++ b/paddle/fluid/operators/jit/kernel_key.h @@ -46,7 +46,7 @@ struct KernelKey { // Every JitCode should have a method to get the key from attribution template -size_t JitCodeKey(const Attr& attr); +int64_t JitCodeKey(const Attr& attr); } // namespace jit } // namespace operators diff --git a/paddle/fluid/operators/jit/kernel_pool.h b/paddle/fluid/operators/jit/kernel_pool.h index 3e15242af28839ee0759e1a5b3930d6d6bfaa0ff..04710a54ac9ddf2ecb8f6a1f2ca33ef158d2d73f 100644 --- a/paddle/fluid/operators/jit/kernel_pool.h +++ b/paddle/fluid/operators/jit/kernel_pool.h @@ -17,6 +17,7 @@ #include // for unique_ptr #include #include +#include // for move #include #include "paddle/fluid/operators/jit/gen_base.h" #include "paddle/fluid/operators/jit/kernel_base.h" @@ -30,7 +31,7 @@ namespace jit { template class JitCodePool { typedef std::unique_ptr GenBasePtr; - typedef std::unordered_map JitCodeMap; + typedef std::unordered_map JitCodeMap; public: JitCodePool() = default; @@ -41,9 +42,9 @@ class JitCodePool { const JitCodeMap& AllKernels() { return codes_; } - bool Has(size_t key) const { return codes_.find(key) != codes_.end(); } + bool Has(int64_t key) const { return codes_.find(key) != codes_.end(); } - void Insert(size_t key, GenBasePtr value) { + void Insert(int64_t key, GenBasePtr value) { codes_.emplace(key, std::move(value)); } diff --git a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc index 16c91f8246dda34b1436fd4edd507e9ff603de6b..1254d00189a315b4f743f48e56b3eb53c92ec694 100644 --- a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc +++ b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc @@ -161,7 +161,7 @@ void CRFDecoding(const int seq_len, const float* x, const float* w, } } -bool CRFDecodingKernel::UseMe(const int& d) const { +bool CRFDecodingKernel::CanBeUsed(const int& d) const { #ifdef __AVX512F__ constexpr int block = ZMM_FLOAT_BLOCK; #else diff --git a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h index 24179d90ddcc6e7f44ffa4b2ca0886fbca5c81bf..49b1a1fea4b16f435120bb37c7d9c8c07a4cc4f5 100644 --- a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h +++ b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h @@ -26,11 +26,11 @@ namespace intrinsic { void CRFDecoding(const int seq_len, const float* x, const float* w, float* alpha, int* track, int tag_num); -class CRFDecodingKernel : public KernelMore> { +class CRFDecodingKernel : public KernelMore> { public: CRFDecodingKernel() { this->func = CRFDecoding; } - bool UseMe( - const typename CRFDecodingTuples::attr_type&) const override; + bool CanBeUsed( + const typename CRFDecodingTuple::attr_type&) const override; const char* ImplType() const override { return "Intrinsic"; } }; diff --git a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc index e9b6e401c6825b21191881d4e57fe09b48d2f4ee..a4e3246f10495b67871c08fd8cb7ccd1cf085c9e 100644 --- a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc +++ b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc @@ -153,7 +153,7 @@ void LayerNorm(float* x, float* out, float* mean, float* var, } } -bool LayerNormKernel::UseMe(const int& d) const { +bool LayerNormKernel::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx) && d >= YMM_FLOAT_BLOCK; } diff --git a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h index 89da2940f4420c418f9bd5260c4b74606cc9168f..7b9f676050d806314edd1e46611416a8b7170add 100644 --- a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h +++ b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h @@ -27,10 +27,11 @@ void LayerNorm(float* x, float* out, float* mean, float* var, const float* scale, const float* bias, int height, const float epsilon, int right); -class LayerNormKernel : public KernelMore> { +class LayerNormKernel : public KernelMore> { public: LayerNormKernel() { this->func = LayerNorm; } - bool UseMe(const typename LayerNormTuples::attr_type&) const override; + bool CanBeUsed( + const typename LayerNormTuple::attr_type&) const override; const char* ImplType() const override { return "Intrinsic"; } }; diff --git a/paddle/fluid/operators/jit/more/mix/mix.cc b/paddle/fluid/operators/jit/more/mix/mix.cc index 0036d1c238b17768c4df61af22a85588990e1815..6e709a16d232e2fa1a77e74e228b763fed4dd75b 100644 --- a/paddle/fluid/operators/jit/more/mix/mix.cc +++ b/paddle/fluid/operators/jit/more/mix/mix.cc @@ -23,6 +23,8 @@ namespace jit { namespace more { namespace mix { +using CPUPlace = platform::CPUPlace; + void VSigmoid(const T* x, T* y, int n) { const float min = SIGMOID_THRESHOLD_MIN; const float max = SIGMOID_THRESHOLD_MAX; @@ -30,7 +32,7 @@ void VSigmoid(const T* x, T* y, int n) { y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); y[i] = static_cast(0) - y[i]; } - auto compute = Get, platform::CPUPlace>(n); + auto compute = KernelFuncs, CPUPlace>::Cache().At(n); compute(y, y, n); for (int i = 0; i < n; ++i) { y[i] = static_cast(1) / (static_cast(1) + y[i]); @@ -39,9 +41,9 @@ void VSigmoid(const T* x, T* y, int n) { void VTanh(const T* x, T* y, int n) { const T a = 2, b = -1; - auto compute_scal = Get, platform::CPUPlace>(n); - auto compute_addbias = Get, platform::CPUPlace>(n); - auto compute_sigmoid = Get, platform::CPUPlace>(n); + auto compute_scal = KernelFuncs, CPUPlace>::Cache().At(n); + auto compute_addbias = KernelFuncs, CPUPlace>::Cache().At(n); + auto compute_sigmoid = KernelFuncs, CPUPlace>::Cache().At(n); compute_scal(&a, x, y, n); compute_sigmoid(y, y, n); compute_scal(&a, y, y, n); @@ -49,16 +51,12 @@ void VTanh(const T* x, T* y, int n) { } void Softmax(const T* x, T* y, int n, int bs) { - auto compute_hmax = - KernelFuncs, platform::CPUPlace>::Cache().At(n); - auto compute_hsum = - KernelFuncs, platform::CPUPlace>::Cache().At(n); - auto compute_vscal = - KernelFuncs, platform::CPUPlace>::Cache().At(n); + auto compute_hmax = KernelFuncs, CPUPlace>::Cache().At(n); + auto compute_hsum = KernelFuncs, CPUPlace>::Cache().At(n); + auto compute_vscal = KernelFuncs, CPUPlace>::Cache().At(n); auto compute_vaddbias = - KernelFuncs, platform::CPUPlace>::Cache().At(n); - auto compute_vexp = - KernelFuncs, platform::CPUPlace>::Cache().At(n); + KernelFuncs, CPUPlace>::Cache().At(n); + auto compute_vexp = KernelFuncs, CPUPlace>::Cache().At(n); for (int i = 0; i < bs; ++i) { T scalar; @@ -76,13 +74,13 @@ void Softmax(const T* x, T* y, int n, int bs) { void (*getActFunc(KernelType type, int d))(const T*, T*, int) { // NOLINT if (type == kVSigmoid) { - return Get, platform::CPUPlace>(d); + return KernelFuncs, CPUPlace>::Cache().At(d); } else if (type == kVRelu) { - return Get, platform::CPUPlace>(d); + return KernelFuncs, CPUPlace>::Cache().At(d); } else if (type == kVTanh) { - return Get, platform::CPUPlace>(d); + return KernelFuncs, CPUPlace>::Cache().At(d); } else if (type == kVIdentity) { - return Get, platform::CPUPlace>(d); + return KernelFuncs, CPUPlace>::Cache().At(d); } PADDLE_THROW("Not support type: %s", type); return nullptr; @@ -98,9 +96,9 @@ void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr) { const int d = attr->d; const int d2 = d * 2; const int d3 = d * 3; - auto vmul_d = Get, platform::CPUPlace>(d); - auto vadd_d = Get, platform::CPUPlace>(d); - auto vadd_d2 = Get, platform::CPUPlace>(d2); + auto vmul_d = KernelFuncs, CPUPlace>::Cache().At(d); + auto vadd_d = KernelFuncs, CPUPlace>::Cache().At(d); + auto vadd_d2 = KernelFuncs, CPUPlace>::Cache().At(d2); auto act_gate_d = getActFunc(attr->act_gate, d); auto act_gate_d2 = getActFunc(attr->act_gate, d2); auto act_gate_d3 = getActFunc(attr->act_gate, d3); @@ -140,8 +138,8 @@ void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr) { int d = attr->d; int d2 = d * 2; int d3 = d * 3; - auto vmul_d = Get, platform::CPUPlace>(d); - auto vadd_d = Get, platform::CPUPlace>(d); + auto vmul_d = KernelFuncs, CPUPlace>::Cache().At(d); + auto vadd_d = KernelFuncs, CPUPlace>::Cache().At(d); auto act_gate_d = getActFunc(attr->act_gate, d); auto act_cand_d = getActFunc(attr->act_cand, d); auto act_cell_d = getActFunc(attr->act_cell, d); @@ -169,7 +167,7 @@ void GRUH1(gru_t* step, const gru_attr_t* attr) { int d2 = d * 2; auto act_gate = getActFunc(attr->act_gate, d); auto act_cand = getActFunc(attr->act_cand, d); - auto vmul_d = Get, platform::CPUPlace>(d); + auto vmul_d = KernelFuncs, CPUPlace>::Cache().At(d); act_gate(gates, gates, d); act_cand(gates + d2, gates + d2, d); vmul_d(gates, gates + d2, ht, d); @@ -182,7 +180,7 @@ void GRUHtPart1(gru_t* step, const gru_attr_t* attr) { T* ht = reinterpret_cast(step->ht); const T* ht_1 = reinterpret_cast(step->ht_1); auto act_gate = getActFunc(attr->act_gate, attr->d); - auto vmul_d = Get, platform::CPUPlace>(attr->d); + auto vmul_d = KernelFuncs, CPUPlace>::Cache().At(attr->d); act_gate(gates + attr->d, gates + attr->d, attr->d); vmul_d(ht_1, gates + attr->d, ht, attr->d); } @@ -206,21 +204,21 @@ void GRUHtPart2(gru_t* step, const gru_attr_t* attr) { } // TODO(TJ): tuning me -bool VSigmoidKernel::UseMe(const int& d) const { return true; } +bool VSigmoidKernel::CanBeUsed(const int& d) const { return true; } -bool VTanhKernel::UseMe(const int& d) const { return true; } +bool VTanhKernel::CanBeUsed(const int& d) const { return true; } -bool SoftmaxKernel::UseMe(const int& d) const { return true; } +bool SoftmaxKernel::CanBeUsed(const int& d) const { return true; } -bool LSTMCtHtKernel::UseMe(const lstm_attr_t& attr) const { return true; } +bool LSTMCtHtKernel::CanBeUsed(const lstm_attr_t& attr) const { return true; } -bool LSTMC1H1Kernel::UseMe(const lstm_attr_t& attr) const { return true; } +bool LSTMC1H1Kernel::CanBeUsed(const lstm_attr_t& attr) const { return true; } -bool GRUH1Kernel::UseMe(const gru_attr_t& attr) const { return true; } +bool GRUH1Kernel::CanBeUsed(const gru_attr_t& attr) const { return true; } -bool GRUHtPart1Kernel::UseMe(const gru_attr_t& attr) const { return true; } +bool GRUHtPart1Kernel::CanBeUsed(const gru_attr_t& attr) const { return true; } -bool GRUHtPart2Kernel::UseMe(const gru_attr_t& attr) const { return true; } +bool GRUHtPart2Kernel::CanBeUsed(const gru_attr_t& attr) const { return true; } } // namespace mix } // namespace more @@ -230,16 +228,16 @@ bool GRUHtPart2Kernel::UseMe(const gru_attr_t& attr) const { return true; } namespace mix = paddle::operators::jit::more::mix; -#define REGISTER_MORE_KERNEL(key, func) \ - REGISTER_JITKERNEL_MORE(key, mix, mix::func##Kernel) - -REGISTER_MORE_KERNEL(kVSigmoid, VSigmoid); -REGISTER_MORE_KERNEL(kVTanh, VTanh); -REGISTER_MORE_KERNEL(kSoftmax, Softmax); -REGISTER_MORE_KERNEL(kLSTMCtHt, LSTMCtHt); -REGISTER_MORE_KERNEL(kLSTMC1H1, LSTMC1H1); -REGISTER_MORE_KERNEL(kGRUH1, GRUH1); -REGISTER_MORE_KERNEL(kGRUHtPart1, GRUHtPart1); -REGISTER_MORE_KERNEL(kGRUHtPart2, GRUHtPart2); +#define REGISTER_MORE_KERNEL(func) \ + REGISTER_JITKERNEL_MORE(k##func, mix, mix::func##Kernel) + +REGISTER_MORE_KERNEL(VSigmoid); +REGISTER_MORE_KERNEL(VTanh); +REGISTER_MORE_KERNEL(Softmax); +REGISTER_MORE_KERNEL(LSTMCtHt); +REGISTER_MORE_KERNEL(LSTMC1H1); +REGISTER_MORE_KERNEL(GRUH1); +REGISTER_MORE_KERNEL(GRUHtPart1); +REGISTER_MORE_KERNEL(GRUHtPart2); #undef REGISTER_MORE_KERNEL diff --git a/paddle/fluid/operators/jit/more/mix/mix.h b/paddle/fluid/operators/jit/more/mix/mix.h index d64af192197a0b339a39a1862c028875da2f3900..994d485909c874a8a15418ad946c79a10265c748 100644 --- a/paddle/fluid/operators/jit/more/mix/mix.h +++ b/paddle/fluid/operators/jit/more/mix/mix.h @@ -34,27 +34,27 @@ void GRUH1(gru_t* step, const gru_attr_t* attr); void GRUHtPart1(gru_t* step, const gru_attr_t* attr); void GRUHtPart2(gru_t* step, const gru_attr_t* attr); -#define DECLARE_MORE_KERNEL(name, tuples) \ - class name##Kernel : public KernelMore> { \ - public: \ - name##Kernel() { this->func = name; } \ - bool UseMe(const typename tuples::attr_type&) const override; \ - const char* ImplType() const override { return "Mixed"; } \ +#define DECLARE_MORE_KERNEL(name) \ + class name##Kernel : public KernelMore> { \ + public: \ + name##Kernel() { this->func = name; } \ + bool CanBeUsed(const typename name##Tuple::attr_type&) const override; \ + const char* ImplType() const override { return "Mixed"; } \ } // XYN -DECLARE_MORE_KERNEL(VSigmoid, XYNTuples); -DECLARE_MORE_KERNEL(VTanh, XYNTuples); +DECLARE_MORE_KERNEL(VSigmoid); +DECLARE_MORE_KERNEL(VTanh); // XRN -DECLARE_MORE_KERNEL(Softmax, SoftmaxTuples); +DECLARE_MORE_KERNEL(Softmax); -DECLARE_MORE_KERNEL(LSTMCtHt, LSTMTuples); -DECLARE_MORE_KERNEL(LSTMC1H1, LSTMTuples); +DECLARE_MORE_KERNEL(LSTMCtHt); +DECLARE_MORE_KERNEL(LSTMC1H1); -DECLARE_MORE_KERNEL(GRUH1, GRUTuples); -DECLARE_MORE_KERNEL(GRUHtPart1, GRUTuples); -DECLARE_MORE_KERNEL(GRUHtPart2, GRUTuples); +DECLARE_MORE_KERNEL(GRUH1); +DECLARE_MORE_KERNEL(GRUHtPart1); +DECLARE_MORE_KERNEL(GRUHtPart2); #undef DECLARE_MORE_KERNEL diff --git a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt index f9e5aea32e7cd48e9b39c4c3ee0e30f4a5c84f6f..f69417c370b653d93cce04a2248ad809168670da 100644 --- a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt +++ b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt @@ -9,7 +9,11 @@ USE_JITKERNEL_MORE(kVAdd, mkl) USE_JITKERNEL_MORE(kVScal, mkl) USE_JITKERNEL_MORE(kVExp, mkl) USE_JITKERNEL_MORE(kVSquare, mkl) +USE_JITKERNEL_MORE(kVCopy, mkl) USE_JITKERNEL_MORE(kVSigmoid, mkl) USE_JITKERNEL_MORE(kVTanh, mkl) USE_JITKERNEL_MORE(kSeqPool, mkl) USE_JITKERNEL_MORE(kSoftmax, mkl) +USE_JITKERNEL_MORE(kEmbSeqPool, mkl) +USE_JITKERNEL_MORE(kSgd, mkl) +USE_JITKERNEL_MORE(kVBroadcast, mkl) diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc index 4c999131ab116ebe3484355158993558b02cc4b2..4f600b38144f53798e3d4c66264fc5bfa671a4f7 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.cc +++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc @@ -130,70 +130,106 @@ void ASum(const double* x, double* res, int n) { // TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512 template <> -bool VMulKernel::UseMe(const int& d) const { +bool VMulKernel::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx512f) && d > 512; } template <> -bool VAddKernel::UseMe(const int& d) const { +bool VAddKernel::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx) && d > 512; } template <> -bool VScalKernel::UseMe(const int& d) const { +bool VScalKernel::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx512f) && d > 512; } template <> -bool VExpKernel::UseMe(const int& d) const { +bool VExpKernel::CanBeUsed(const int& d) const { return d > 7; } template <> -bool VSquareKernel::UseMe(const int& d) const { +bool VSquareKernel::CanBeUsed(const int& d) const { return d > 7; } template <> -bool VSigmoidKernel::UseMe(const int& d) const { +bool VCopyKernel::CanBeUsed(const int& d) const { + return d > 15; +} + +template <> +bool VBroadcastKernel::CanBeUsed(const int64_t& d) const { + return d > 127; +} + +template <> +bool VBroadcastKernel::CanBeUsed(const int64_t& attr) const { + return true; +} + +template <> +bool VSigmoidKernel::CanBeUsed(const int& d) const { return d > 7; } template <> -bool VTanhKernel::UseMe(const int& d) const { +bool VTanhKernel::CanBeUsed(const int& d) const { return d > 7; } template <> -bool SeqPoolKernel::UseMe(const seq_pool_attr_t& attr) const { +bool SeqPoolKernel::CanBeUsed(const seq_pool_attr_t& attr) const { + return true; +} + +template <> +bool SeqPoolKernel::CanBeUsed(const seq_pool_attr_t& attr) const { + return true; +} + +template <> +bool EmbSeqPoolKernel::CanBeUsed(const emb_seq_pool_attr_t& attr) const { + return true; +} + +template <> +bool EmbSeqPoolKernel::CanBeUsed( + const emb_seq_pool_attr_t& attr) const { + return true; +} + +template <> +bool SgdKernel::CanBeUsed(const sgd_attr_t& attr) const { return true; } template <> -bool SeqPoolKernel::UseMe(const seq_pool_attr_t& attr) const { +bool SgdKernel::CanBeUsed(const sgd_attr_t& attr) const { return true; } template <> -bool MatMulKernel::UseMe(const matmul_attr_t& attr) const { +bool MatMulKernel::CanBeUsed(const matmul_attr_t& attr) const { return platform::MayIUse(platform::avx); } template <> -bool MatMulKernel::UseMe(const matmul_attr_t& attr) const { +bool MatMulKernel::CanBeUsed(const matmul_attr_t& attr) const { return true; } template <> -bool SoftmaxKernel::UseMe(const int& d) const { +bool SoftmaxKernel::CanBeUsed(const int& d) const { // tuned on avx2 return platform::MayIUse(platform::avx) && d < 60; } -#define AWALYS_USE_ME_WITH_DOUBLE(func) \ - template <> \ - bool func##Kernel::UseMe(const int& d) const { \ - return true; \ +#define AWALYS_USE_ME_WITH_DOUBLE(func) \ + template <> \ + bool func##Kernel::CanBeUsed(const int& d) const { \ + return true; \ } AWALYS_USE_ME_WITH_DOUBLE(VMul); @@ -203,6 +239,7 @@ AWALYS_USE_ME_WITH_DOUBLE(VExp); AWALYS_USE_ME_WITH_DOUBLE(VSigmoid); AWALYS_USE_ME_WITH_DOUBLE(VTanh); AWALYS_USE_ME_WITH_DOUBLE(VSquare); +AWALYS_USE_ME_WITH_DOUBLE(VCopy); AWALYS_USE_ME_WITH_DOUBLE(Softmax); #undef AWALYS_USE_ME_WITH_DOUBLE @@ -214,19 +251,23 @@ AWALYS_USE_ME_WITH_DOUBLE(Softmax); namespace mkl = paddle::operators::jit::more::mkl; -#define REGISTER_MKL_KERNEL(key, func) \ - REGISTER_JITKERNEL_MORE(key, mkl, mkl::func##Kernel, \ +#define REGISTER_MKL_KERNEL(func) \ + REGISTER_JITKERNEL_MORE(k##func, mkl, mkl::func##Kernel, \ mkl::func##Kernel) -REGISTER_MKL_KERNEL(kMatMul, MatMul); -REGISTER_MKL_KERNEL(kVMul, VMul); -REGISTER_MKL_KERNEL(kVAdd, VAdd); -REGISTER_MKL_KERNEL(kVScal, VScal); -REGISTER_MKL_KERNEL(kVExp, VExp); -REGISTER_MKL_KERNEL(kVSquare, VSquare); -REGISTER_MKL_KERNEL(kVSigmoid, VSigmoid); -REGISTER_MKL_KERNEL(kVTanh, VTanh); -REGISTER_MKL_KERNEL(kSeqPool, SeqPool); -REGISTER_MKL_KERNEL(kSoftmax, Softmax); +REGISTER_MKL_KERNEL(MatMul); +REGISTER_MKL_KERNEL(VMul); +REGISTER_MKL_KERNEL(VAdd); +REGISTER_MKL_KERNEL(VScal); +REGISTER_MKL_KERNEL(VExp); +REGISTER_MKL_KERNEL(VSquare); +REGISTER_MKL_KERNEL(VCopy); +REGISTER_MKL_KERNEL(VBroadcast); +REGISTER_MKL_KERNEL(VSigmoid); +REGISTER_MKL_KERNEL(VTanh); +REGISTER_MKL_KERNEL(SeqPool); +REGISTER_MKL_KERNEL(EmbSeqPool); +REGISTER_MKL_KERNEL(Softmax); +REGISTER_MKL_KERNEL(Sgd); #undef REGISTER_MKL_KERNEL diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h index 8130b87326f1887f232022ab30fa7bf42b0723e7..f51dca654cd3d93dcd396af7895aebf5ee915c22 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.h +++ b/paddle/fluid/operators/jit/more/mkl/mkl.h @@ -18,6 +18,7 @@ #include #include #include "paddle/fluid/operators/jit/kernel_base.h" +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace operators { @@ -49,6 +50,13 @@ void VCopy(const T* x, T* y, int n); template void VAXPY(T a, const T* x, T* y, int n); +template +void VBroadcast(const T* x, T* y, int64_t y_h, int64_t x_len) { + for (int64_t h = 0; h < y_h; ++h) { + VCopy(x, y + h * x_len, x_len); + } +} + template void VSigmoid(const T* x, T* y, int n) { const T min = SIGMOID_THRESHOLD_MIN; @@ -91,6 +99,32 @@ void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) { } } +template +void EmbSeqPool(const T* table, const int64_t* idx, T* out, + const emb_seq_pool_attr_t* attr) { + PADDLE_ENFORCE_EQ(attr->table_width * attr->index_width, attr->out_width); + auto check_idx_value_valid = [&](int64_t i) { + PADDLE_ENFORCE_LT(idx[i], attr->table_height, "idx value: %d, i: %d", + idx[i], i); + PADDLE_ENFORCE_GE(idx[i], 0, "idx value: %d, i: %d", idx[i], i); + }; + + for (int64_t w = 0; w != attr->index_width; ++w) { + check_idx_value_valid(w); + VCopy(table + idx[w] * attr->table_width, out + w * attr->table_width, + attr->table_width); + } + + for (int64_t h = 1; h < attr->index_height; ++h) { + for (int64_t w = 0; w < attr->index_width; ++w) { + int64_t i = h * attr->index_width + w; + check_idx_value_valid(i); + VAXPY(static_cast(1), table + idx[i] * attr->table_width, + out + w * attr->table_width, attr->table_width); + } + } +} + template void ASum(const T* x, T* res, int n); @@ -115,34 +149,64 @@ void Softmax(const T* x, T* y, int n, int bs) { } } -#define DECLARE_MKL_KERNEL(name, tuples) \ - template \ - class name##Kernel : public KernelMore> { \ - public: \ - name##Kernel() { this->func = name; } \ - bool UseMe(const typename tuples::attr_type&) const override; \ - const char* ImplType() const override { return "MKL"; } \ +template +void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows, + T* out, const sgd_attr_t* attr) { + PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width); + PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height); + T scalar = -lr[0]; + int width = attr->grad_width; + if (out == param) { + for (int64_t i = 0; i < attr->selected_rows_size; ++i) { + auto h_idx = rows[i]; + PADDLE_ENFORCE_LT(h_idx, attr->param_height); + PADDLE_ENFORCE_GE(h_idx, 0); + VAXPY(scalar, grad + i * width, out + h_idx * width, width); + } + } else { + for (int64_t i = 0; i < attr->selected_rows_size; ++i) { + auto h_idx = rows[i]; + PADDLE_ENFORCE_LT(h_idx, attr->param_height); + PADDLE_ENFORCE_GE(h_idx, 0); + VScal(&scalar, grad + i * width, out + h_idx * width, width); + VAdd(param + h_idx * width, out + h_idx * width, out + h_idx * width, + width); + } + } +} + +#define DECLARE_MKL_KERNEL(name) \ + template \ + class name##Kernel : public KernelMore> { \ + public: \ + name##Kernel() { this->func = name; } \ + bool CanBeUsed(const typename name##Tuple::attr_type&) const override; \ + const char* ImplType() const override { return "MKL"; } \ } // ABCMNK -DECLARE_MKL_KERNEL(MatMul, MatMulTuples); +DECLARE_MKL_KERNEL(MatMul); // XYZN -DECLARE_MKL_KERNEL(VMul, XYZNTuples); -DECLARE_MKL_KERNEL(VAdd, XYZNTuples); +DECLARE_MKL_KERNEL(VMul); +DECLARE_MKL_KERNEL(VAdd); // AXYN -DECLARE_MKL_KERNEL(VScal, AXYNTuples); +DECLARE_MKL_KERNEL(VScal); // XYN -DECLARE_MKL_KERNEL(VExp, XYNTuples); -DECLARE_MKL_KERNEL(VSigmoid, XYNTuples); -DECLARE_MKL_KERNEL(VTanh, XYNTuples); -DECLARE_MKL_KERNEL(VSquare, XYNTuples); - -DECLARE_MKL_KERNEL(SeqPool, SeqPoolTuples); - -DECLARE_MKL_KERNEL(Softmax, SoftmaxTuples); +DECLARE_MKL_KERNEL(VExp); +DECLARE_MKL_KERNEL(VSigmoid); +DECLARE_MKL_KERNEL(VTanh); +DECLARE_MKL_KERNEL(VSquare); +DECLARE_MKL_KERNEL(VCopy); + +// others +DECLARE_MKL_KERNEL(SeqPool); +DECLARE_MKL_KERNEL(EmbSeqPool); +DECLARE_MKL_KERNEL(Softmax); +DECLARE_MKL_KERNEL(Sgd); +DECLARE_MKL_KERNEL(VBroadcast); #undef DECLARE_MKL_KERNEL diff --git a/paddle/fluid/operators/jit/refer/CMakeLists.txt b/paddle/fluid/operators/jit/refer/CMakeLists.txt index 9f2935828ca300dbdb71b0fefb6b9883cb45e4b0..ffab9c1457b932b3211e6aa75954bb1435f8e34c 100644 --- a/paddle/fluid/operators/jit/refer/CMakeLists.txt +++ b/paddle/fluid/operators/jit/refer/CMakeLists.txt @@ -13,6 +13,7 @@ USE_JITKERNEL_REFER(kVAddRelu) USE_JITKERNEL_REFER(kVSub) USE_JITKERNEL_REFER(kVScal) USE_JITKERNEL_REFER(kVAddBias) +USE_JITKERNEL_REFER(kVCopy) USE_JITKERNEL_REFER(kVRelu) USE_JITKERNEL_REFER(kVIdentity) USE_JITKERNEL_REFER(kVExp) @@ -32,3 +33,6 @@ USE_JITKERNEL_REFER(kVSquare) USE_JITKERNEL_REFER(kHSum) USE_JITKERNEL_REFER(kHMax) USE_JITKERNEL_REFER(kSoftmax) +USE_JITKERNEL_REFER(kEmbSeqPool) +USE_JITKERNEL_REFER(kSgd) +USE_JITKERNEL_REFER(kVBroadcast) diff --git a/paddle/fluid/operators/jit/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc index b8adb40ec7e1b64df2b04a3201292db235af7b19..0d1c4770903fc59160e308b958270e5826928d61 100644 --- a/paddle/fluid/operators/jit/refer/refer.cc +++ b/paddle/fluid/operators/jit/refer/refer.cc @@ -17,44 +17,43 @@ namespace refer = paddle::operators::jit::refer; -#define REGISTER_REFER_KERNEL(key, func) \ - REGISTER_JITKERNEL_REFER(key, refer::func##Kernel, \ +#define REGISTER_REFER_KERNEL(func) \ + REGISTER_JITKERNEL_REFER(k##func, refer::func##Kernel, \ refer::func##Kernel) -REGISTER_REFER_KERNEL(kVMul, VMul); -REGISTER_REFER_KERNEL(kVAdd, VAdd); -REGISTER_REFER_KERNEL(kVAddRelu, VAddRelu); -REGISTER_REFER_KERNEL(kVSub, VSub); - -REGISTER_REFER_KERNEL(kVScal, VScal); -REGISTER_REFER_KERNEL(kVAddBias, VAddBias); - -REGISTER_REFER_KERNEL(kVRelu, VRelu); -REGISTER_REFER_KERNEL(kVIdentity, VIdentity); -REGISTER_REFER_KERNEL(kVSquare, VSquare); -REGISTER_REFER_KERNEL(kVExp, VExp); -REGISTER_REFER_KERNEL(kVSigmoid, VSigmoid); -REGISTER_REFER_KERNEL(kVTanh, VTanh); - -REGISTER_REFER_KERNEL(kLSTMCtHt, LSTMCtHt); -REGISTER_REFER_KERNEL(kLSTMC1H1, LSTMC1H1); - -REGISTER_REFER_KERNEL(kGRUH1, GRUH1); -REGISTER_REFER_KERNEL(kGRUHtPart1, GRUHtPart1); -REGISTER_REFER_KERNEL(kGRUHtPart2, GRUHtPart2); - -REGISTER_REFER_KERNEL(kCRFDecoding, CRFDecoding); -REGISTER_REFER_KERNEL(kLayerNorm, LayerNorm); - -REGISTER_REFER_KERNEL(kNCHW16CMulNC, NCHW16CMulNC); - -REGISTER_REFER_KERNEL(kSeqPool, SeqPool); - -REGISTER_REFER_KERNEL(kMatMul, MatMul); - -REGISTER_REFER_KERNEL(kHMax, HMax); -REGISTER_REFER_KERNEL(kHSum, HSum); - -REGISTER_REFER_KERNEL(kSoftmax, Softmax); +REGISTER_REFER_KERNEL(VMul); +REGISTER_REFER_KERNEL(VAdd); +REGISTER_REFER_KERNEL(VAddRelu); +REGISTER_REFER_KERNEL(VSub); + +REGISTER_REFER_KERNEL(VScal); +REGISTER_REFER_KERNEL(VAddBias); + +REGISTER_REFER_KERNEL(VRelu); +REGISTER_REFER_KERNEL(VCopy); +REGISTER_REFER_KERNEL(VIdentity); +REGISTER_REFER_KERNEL(VSquare); +REGISTER_REFER_KERNEL(VExp); +REGISTER_REFER_KERNEL(VSigmoid); +REGISTER_REFER_KERNEL(VTanh); + +REGISTER_REFER_KERNEL(LSTMCtHt); +REGISTER_REFER_KERNEL(LSTMC1H1); + +REGISTER_REFER_KERNEL(GRUH1); +REGISTER_REFER_KERNEL(GRUHtPart1); +REGISTER_REFER_KERNEL(GRUHtPart2); + +REGISTER_REFER_KERNEL(CRFDecoding); +REGISTER_REFER_KERNEL(LayerNorm); +REGISTER_REFER_KERNEL(NCHW16CMulNC); +REGISTER_REFER_KERNEL(SeqPool); +REGISTER_REFER_KERNEL(MatMul); +REGISTER_REFER_KERNEL(HMax); +REGISTER_REFER_KERNEL(HSum); +REGISTER_REFER_KERNEL(Softmax); +REGISTER_REFER_KERNEL(EmbSeqPool); +REGISTER_REFER_KERNEL(Sgd); +REGISTER_REFER_KERNEL(VBroadcast); #undef REGISTER_REFER_KERNEL diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index 0c4a985f8e8ece0a6169478fa3a9b111f5a6f3b4..cac705a484127b4813ef2d0996bf5aaee2b9f1b3 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -16,6 +16,7 @@ #include #include +#include #include "paddle/fluid/operators/jit/helper.h" #include "paddle/fluid/operators/jit/kernel_base.h" #include "paddle/fluid/platform/enforce.h" @@ -69,6 +70,20 @@ void VAddBias(const T* a, const T* x, T* y, int n) { } } +template +void VCopy(const T* x, T* y, int n) { + std::memcpy(y, x, n * sizeof(T)); +} + +// x shape: (x_len) +// y shape: (h, x_len) +template +void VBroadcast(const T* x, T* y, int64_t y_h, int64_t x_len) { + for (int64_t h = 0; h < y_h; ++h) { + VCopy(x, y + h * x_len, x_len); + } +} + template void VRelu(const T* x, T* y, int n) { for (int i = 0; i < n; ++i) { @@ -414,53 +429,115 @@ void Softmax(const T* x, T* y, int n, int bs = 1) { } } -#define DECLARE_REFER_KERNEL(name, tuples) \ - template \ - class name##Kernel : public ReferKernel> { \ - public: \ - name##Kernel() { this->func = name; } \ +// embedding seq pool +// table is a matrix with (tbl_h, tbl_w) +// idx is a matrix with (idx_h, idx_w) +// output is a vector with length tbl_w * idx_w +template +void EmbSeqPool(const T* table, const int64_t* idx, T* out, + const emb_seq_pool_attr_t* attr) { + PADDLE_ENFORCE_EQ(attr->table_width * attr->index_width, attr->out_width); + + auto check_idx_value_valid = [&](int64_t i) { + PADDLE_ENFORCE_LT(idx[i], attr->table_height, "idx value: %d, i: %d", + idx[i], i); + PADDLE_ENFORCE_GE(idx[i], 0, "idx value: %d, i: %d", idx[i], i); + }; + + for (int64_t w = 0; w != attr->index_width; ++w) { + check_idx_value_valid(w); + std::memcpy(out + w * attr->table_width, table + idx[w] * attr->table_width, + attr->table_width * sizeof(T)); + } + + for (int64_t h = 1; h < attr->index_height; ++h) { + for (int64_t w = 0; w < attr->index_width; ++w) { + int64_t i = h * attr->index_width + w; + check_idx_value_valid(i); + VAdd(table + idx[i] * attr->table_width, out + w * attr->table_width, + out + w * attr->table_width, attr->table_width); + } + } +} + +// SGD algorithm: +// lr is pointor of learning rate scalar +// param is an input matrix with (param_h, param_w) +// grad is an input matrix with (grad_h, grad_w), here grad_w == param_w +// selected_rows is a vectot with size selected_rows_size( <= grad_h ) +// out is an output matrix with (param_h, param_w) +// +// support both regular and sparse grad +// regular SGD: out[:] = param[:] - lr[0] * grad[:]; +// sparse SGD: out[rows[i]][:] = param[rows[i]][:] - lr[0] * grad[i][:] +// +// Note: when use sparse SGD, and if out != param, +// the out rows which are not selected have not beed changed, which maybe empty +template +void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows, + T* out, const sgd_attr_t* attr) { + PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width); + PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height); + for (int64_t i = 0; i < attr->selected_rows_size; ++i) { + auto h_idx = rows[i]; + PADDLE_ENFORCE_LT(h_idx, attr->param_height); + PADDLE_ENFORCE_GE(h_idx, 0); + for (int64_t j = 0; j < attr->grad_width; ++j) { + out[h_idx * attr->grad_width + j] = + param[h_idx * attr->grad_width + j] - + lr[0] * grad[i * attr->grad_width + j]; + } + } +} + +#define DECLARE_REFER_KERNEL(name) \ + template \ + class name##Kernel : public ReferKernel> { \ + public: \ + name##Kernel() { this->func = name; } \ } // const T* x, const T* y, T* z, int n -DECLARE_REFER_KERNEL(VMul, XYZNTuples); -DECLARE_REFER_KERNEL(VAdd, XYZNTuples); -DECLARE_REFER_KERNEL(VAddRelu, XYZNTuples); -DECLARE_REFER_KERNEL(VSub, XYZNTuples); +DECLARE_REFER_KERNEL(VMul); +DECLARE_REFER_KERNEL(VAdd); +DECLARE_REFER_KERNEL(VAddRelu); +DECLARE_REFER_KERNEL(VSub); // const T* a, const T* x, T* y, int n -DECLARE_REFER_KERNEL(VScal, AXYNTuples); -DECLARE_REFER_KERNEL(VAddBias, AXYNTuples); +DECLARE_REFER_KERNEL(VScal); +DECLARE_REFER_KERNEL(VAddBias); // const T* x, T* y, int n -DECLARE_REFER_KERNEL(VRelu, XYNTuples); -DECLARE_REFER_KERNEL(VIdentity, XYNTuples); -DECLARE_REFER_KERNEL(VExp, XYNTuples); -DECLARE_REFER_KERNEL(VSigmoid, XYNTuples); -DECLARE_REFER_KERNEL(VTanh, XYNTuples); -DECLARE_REFER_KERNEL(VSquare, XYNTuples); +DECLARE_REFER_KERNEL(VRelu); +DECLARE_REFER_KERNEL(VIdentity); +DECLARE_REFER_KERNEL(VExp); +DECLARE_REFER_KERNEL(VSigmoid); +DECLARE_REFER_KERNEL(VTanh); +DECLARE_REFER_KERNEL(VSquare); +DECLARE_REFER_KERNEL(VCopy); // lstm_t*, const lstm_attr_t* -DECLARE_REFER_KERNEL(LSTMCtHt, LSTMTuples); -DECLARE_REFER_KERNEL(LSTMC1H1, LSTMTuples); +DECLARE_REFER_KERNEL(LSTMCtHt); +DECLARE_REFER_KERNEL(LSTMC1H1); // gru_t*, const gru_attr_t* -DECLARE_REFER_KERNEL(GRUH1, GRUTuples); -DECLARE_REFER_KERNEL(GRUHtPart1, GRUTuples); -DECLARE_REFER_KERNEL(GRUHtPart2, GRUTuples); - -DECLARE_REFER_KERNEL(CRFDecoding, CRFDecodingTuples); -DECLARE_REFER_KERNEL(LayerNorm, LayerNormTuples); - -DECLARE_REFER_KERNEL(NCHW16CMulNC, NCHW16CMulNCTuples); - -DECLARE_REFER_KERNEL(SeqPool, SeqPoolTuples); - -DECLARE_REFER_KERNEL(MatMul, MatMulTuples); - -DECLARE_REFER_KERNEL(HMax, XRNTuples); -DECLARE_REFER_KERNEL(HSum, XRNTuples); - -DECLARE_REFER_KERNEL(Softmax, SoftmaxTuples); +DECLARE_REFER_KERNEL(GRUH1); +DECLARE_REFER_KERNEL(GRUHtPart1); +DECLARE_REFER_KERNEL(GRUHtPart2); + +DECLARE_REFER_KERNEL(HMax); +DECLARE_REFER_KERNEL(HSum); + +// others +DECLARE_REFER_KERNEL(CRFDecoding); +DECLARE_REFER_KERNEL(LayerNorm); +DECLARE_REFER_KERNEL(NCHW16CMulNC); +DECLARE_REFER_KERNEL(SeqPool); +DECLARE_REFER_KERNEL(MatMul); +DECLARE_REFER_KERNEL(Softmax); +DECLARE_REFER_KERNEL(EmbSeqPool); +DECLARE_REFER_KERNEL(Sgd); +DECLARE_REFER_KERNEL(VBroadcast); #undef DECLARE_REFER_KERNEL diff --git a/paddle/fluid/operators/jit/registry.h b/paddle/fluid/operators/jit/registry.h index cb32c487208fe8fe9e72c069db8833c736316aec..567a903236979ff4ac6095033f53d2a473f4eb2c 100644 --- a/paddle/fluid/operators/jit/registry.h +++ b/paddle/fluid/operators/jit/registry.h @@ -17,6 +17,7 @@ #include #include #include +#include // for std::move #include "paddle/fluid/operators/jit/kernel_base.h" #include "paddle/fluid/operators/jit/kernel_pool.h" #include "paddle/fluid/platform/place.h" @@ -49,8 +50,8 @@ struct JitKernelRegistrarFunctor { void operator()(KernelType kt) const { KernelKey kkey(kt, PlaceType()); - Pool().Instance().Insert(kkey, - std::move(make_unique())); + Pool::Instance().Insert(kkey, + std::move(make_unique())); constexpr auto size = std::tuple_size>::value; JitKernelRegistrarFunctor diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 237e588d35cc3b33658a830db34676967818aab6..6c099a7a062472e2701401ddc58bb9051074f810 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -1,17 +1,19 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include #include #include #include @@ -25,8 +27,8 @@ DEFINE_double(acc, 1e-5, "Test accuracy threshold."); template -void RandomVec(const int n, T* a, const T lower = static_cast(-20.f), - const T upper = static_cast(20.f)) { +void RandomVec(const int n, T* a, const T lower = static_cast(-2.f), + const T upper = static_cast(2.f)) { static unsigned int seed = 100; std::mt19937 rng(seed++); std::uniform_real_distribution uniform_dist(0, 1); @@ -36,14 +38,14 @@ void RandomVec(const int n, T* a, const T lower = static_cast(-20.f), } template -void ExpectEQ(const T* target, const T* refer, int n) { +void ExpectEQ(const T* target, const T* refer, size_t n) { if (std::is_floating_point::value) { - for (int i = 0; i < n; ++i) { - EXPECT_NEAR(target[i], refer[i], FLAGS_acc); + for (size_t i = 0; i < n; ++i) { + EXPECT_NEAR(target[i], refer[i], FLAGS_acc) << " at index : " << i; } } else { - for (int i = 0; i < n; ++i) { - EXPECT_EQ(target[i], refer[i]); + for (size_t i = 0; i < n; ++i) { + EXPECT_EQ(target[i], refer[i]) << " at index : " << i; } } } @@ -63,271 +65,23 @@ std::vector TestSizes() { namespace jit = paddle::operators::jit; using CPUPlace = paddle::platform::CPUPlace; -template -struct TestFuncWithRefer { - void operator()(const typename KernelTuples::func_type tgt, Args... args) { - LOG(FATAL) << "Should specify this function."; - } -}; - -template -struct TestFuncWithRefer, std::vector, std::vector, - std::vector> { - void operator()(const typename jit::XYZNTuples::func_type tgt, - const std::vector& x, const std::vector& y, - const std::vector& zref) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(zref.size(), x.size()); - EXPECT_EQ(zref.size(), y.size()); - const T* x_data = x.data(); - const T* y_data = y.data(); - const T* zref_data = zref.data(); - const int d = zref.size(); - - std::vector ztgt(d); - T* ztgt_data = ztgt.data(); - // test normal - tgt(x_data, y_data, ztgt_data, d); - ExpectEQ(ztgt_data, zref_data, d); - // test inplace x - std::copy(x.begin(), x.end(), ztgt.begin()); - tgt(ztgt_data, y_data, ztgt_data, d); - ExpectEQ(ztgt_data, zref_data, d); - // test inplace y - std::copy(y.begin(), y.end(), ztgt.begin()); - tgt(x_data, ztgt_data, ztgt_data, d); - ExpectEQ(ztgt_data, zref_data, d); - } -}; - -template -struct TestFuncWithRefer, T, std::vector, - std::vector> { - void operator()(const typename jit::AXYNTuples::func_type tgt, const T a, - const std::vector& x, const std::vector& yref) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(yref.size(), x.size()); - const T* x_data = x.data(); - const T* yref_data = yref.data(); - const int d = yref.size(); - std::vector ytgt(d); - T* ytgt_data = ytgt.data(); - // test normal - tgt(&a, x_data, ytgt_data, d); - ExpectEQ(ytgt_data, yref_data, d); - // test inplace x - std::copy(x.begin(), x.end(), ytgt.begin()); - tgt(&a, ytgt_data, ytgt_data, d); - ExpectEQ(ytgt_data, yref_data, d); - } -}; - -template -struct TestFuncWithRefer, std::vector, std::vector, - int, int> { - void operator()(const typename jit::SoftmaxTuples::func_type tgt, - const std::vector& x, const std::vector& yref, int n, - int bs) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(yref.size(), x.size()); - EXPECT_EQ(x.size(), static_cast(n * bs)); - const T* x_data = x.data(); - const T* yref_data = yref.data(); - std::vector ytgt(n * bs); - T* ytgt_data = ytgt.data(); - // test normal - tgt(x_data, ytgt_data, n, bs); - ExpectEQ(ytgt_data, yref_data, n * bs); - // test inplace x - std::copy(x.begin(), x.end(), ytgt.begin()); - tgt(ytgt_data, ytgt_data, n, bs); - ExpectEQ(ytgt_data, yref_data, n * bs); - } -}; - -template -struct TestFuncWithRefer, std::vector, T> { - void operator()(const typename jit::XRNTuples::func_type tgt, - const std::vector& x, const T ref_res) { - EXPECT_TRUE(tgt != nullptr); - T tgt_res; - tgt(x.data(), &tgt_res, x.size()); - ExpectEQ(&tgt_res, &ref_res, 1); - } -}; - -template -struct TestFuncWithRefer, std::vector, std::vector> { - void operator()(const typename jit::XYNTuples::func_type tgt, - const std::vector& x, const std::vector& yref) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(yref.size(), x.size()); - const T* x_data = x.data(); - const T* yref_data = yref.data(); - const int d = yref.size(); - std::vector ytgt(d); - T* ytgt_data = ytgt.data(); - // test normal - tgt(x_data, ytgt_data, d); - ExpectEQ(ytgt_data, yref_data, d); - // test inplace x - std::copy(x.begin(), x.end(), ytgt.begin()); - tgt(ytgt_data, ytgt_data, d); - ExpectEQ(ytgt_data, yref_data, d); - } -}; - -template -struct TestFuncWithRefer, std::vector, std::vector, - std::vector, std::vector, std::vector, - typename jit::LSTMTuples::attr_type> { - void operator()(const typename jit::LSTMTuples::func_type tgt, - const std::vector& xsrc, const std::vector& wp, - const std::vector& ct_1, const std::vector& ct_ref, - const std::vector& ht_ref, - const typename jit::LSTMTuples::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(ct_ref.size(), ht_ref.size()); - EXPECT_EQ(ct_1.size(), ht_ref.size()); - EXPECT_EQ(xsrc.size(), 4 * ht_ref.size()); - EXPECT_EQ(wp.size(), 3 * ht_ref.size()); - - // x could be changed after compute, so copy to save src - int d = ht_ref.size(); - std::vector x(xsrc.size()), ct(ct_ref.size()), ht(ht_ref.size()); - std::vector checked(2 * d); - std::copy(xsrc.begin(), xsrc.end(), x.begin()); - - const T* ct_1_data = ct_1.data(); - const T* wp_data = wp.data(); - const T* ct_ref_data = ct_ref.data(); - const T* ht_ref_data = ht_ref.data(); - T* x_data = x.data(); - T* ct_data = ct.data(); - T* ht_data = ht.data(); - T* checked_data = checked.data(); - - jit::lstm_t step; - step.gates = x_data; - step.ct_1 = ct_1_data; - step.ct = ct_data; - step.ht = ht_data; - if (attr.use_peephole) { - step.wp = wp_data; - step.checked = checked_data; - } - - tgt(&step, &attr); - ExpectEQ(ct_data, ct_ref_data, d); - ExpectEQ(ht_data, ht_ref_data, d); - } -}; - -template -struct TestFuncWithRefer, std::vector, std::vector, - std::vector, - typename jit::GRUTuples::attr_type> { - void operator()(const typename jit::GRUTuples::func_type tgt, - const std::vector& xsrc, const std::vector& ht_1, - const std::vector& ht_ref, - const typename jit::GRUTuples::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(ht_1.size(), ht_ref.size()); - EXPECT_EQ(xsrc.size(), 3 * ht_ref.size()); - - // x could be changed after compute, so copy to save src - int d = ht_ref.size(); - std::vector x(xsrc.size()), ht(ht_ref.size()); - std::copy(xsrc.begin(), xsrc.end(), x.begin()); - const T* ht_1_data = ht_1.data(); - const T* ht_ref_data = ht_ref.data(); - T* x_data = x.data(); - T* ht_data = ht.data(); - jit::gru_t step; - step.gates = x_data; - step.ht_1 = ht_1_data; - step.ht = ht_data; - tgt(&step, &attr); - ExpectEQ(ht_data, ht_ref_data, d); - } -}; - -template -struct TestFuncWithRefer, std::vector, std::vector, - typename jit::SeqPoolTuples::attr_type> { - void operator()(const typename jit::SeqPoolTuples::func_type tgt, - const std::vector& x, const std::vector& yref, - const typename jit::SeqPoolTuples::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(x.size() % yref.size(), 0); - int w = yref.size(); - std::vector y(w); - const T* x_data = x.data(); - const T* yref_data = yref.data(); - T* y_data = y.data(); - tgt(x_data, y_data, &attr); - ExpectEQ(y_data, yref_data, w); - } -}; - -template -struct TestFuncWithRefer, std::vector, std::vector, - std::vector, - typename jit::MatMulTuples::attr_type> { - void operator()(const typename jit::MatMulTuples::func_type tgt, - const std::vector& a, const std::vector& b, - const std::vector& cref, - const typename jit::MatMulTuples::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(a.size(), static_cast(attr.m * attr.k)); - EXPECT_EQ(b.size(), static_cast(attr.k * attr.n)); - EXPECT_EQ(cref.size(), static_cast(attr.m * attr.n)); - std::vector c(cref.size()); - const T* a_data = a.data(); - const T* b_data = b.data(); - const T* cref_data = cref.data(); - T* c_data = c.data(); - tgt(a_data, b_data, c_data, &attr); - ExpectEQ(c_data, cref_data, attr.m * attr.n); - } -}; - -template -void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { - TestFuncWithRefer test; - // test jitcode - auto jitcode = jit::GetJitCode(attr); - if (jitcode) { - VLOG(10) << "Test Jitcode Kernel "; - test(jitcode, args...); - } - // test all impls in more - jit::KernelKey kkey(KT, PlaceType()); - auto& pool = jit::KernelPool().Instance().AllKernels(); - auto iter = pool.find(kkey); - if (iter != pool.end()) { - auto& impls = iter->second; - for (auto& impl : impls) { - auto i = dynamic_cast*>(impl.get()); - if (i && i->UseMe(attr)) { - auto more = i->GetFunc(); - VLOG(10) << "Test More Kernel : " << i->ImplType(); - test(more, args...); - } - } +void TestAllImpls(const typename KernelTuple::attr_type& attr, + const Tester& verifier, const Args&... args) { + auto funcs = jit::GetAllCandidateFuncsWithTypes(attr); + for (auto f : funcs) { + VLOG(10) << "Test Kernel " << f.first; + verifier(f.second, args...); } - // test result from Get function - // VLOG(10) << "Test Get function "; - auto tgt = jit::Get(attr); - test(tgt, args...); } -template -void TestXYZNKernel() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); +template +void TestKernelXYZN() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); for (int d : TestSizes()) { - auto ref = jit::GetRefer>(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector x(d), y(d), zref(d); @@ -351,16 +105,42 @@ void TestXYZNKernel() { ExpectEQ(xinp_data, zref_data, d); ExpectEQ(yinp_data, zref_data, d); - TestAllImpls, PlaceType, std::vector, - std::vector, std::vector>(d, x, y, zref); + auto verifier = [](const typename KernelTuple::func_type tgt, + const std::vector& x, const std::vector& y, + const std::vector& zref) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(zref.size(), x.size()); + EXPECT_EQ(zref.size(), y.size()); + const T* x_data = x.data(); + const T* y_data = y.data(); + const T* zref_data = zref.data(); + const int d = zref.size(); + + std::vector ztgt(d); + T* ztgt_data = ztgt.data(); + // test normal + tgt(x_data, y_data, ztgt_data, d); + ExpectEQ(ztgt_data, zref_data, d); + // test inplace x + std::copy(x.begin(), x.end(), ztgt.begin()); + tgt(ztgt_data, y_data, ztgt_data, d); + ExpectEQ(ztgt_data, zref_data, d); + // test inplace y + std::copy(y.begin(), y.end(), ztgt.begin()); + tgt(x_data, ztgt_data, ztgt_data, d); + ExpectEQ(ztgt_data, zref_data, d); + }; + + TestAllImpls(d, verifier, x, y, zref); } } -template -void TestAXYNKernel() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); +template +void TestKernelAXYN() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); for (int d : TestSizes()) { - auto ref = jit::GetRefer>(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); const T a = static_cast(3); @@ -377,39 +157,38 @@ void TestAXYNKernel() { ref(&a, xinp_data, xinp_data, d); ExpectEQ(xinp_data, yref_data, d); - TestAllImpls, PlaceType, T, std::vector, - std::vector>(d, a, x, yref); - } -} - -template -void TestXRNKernel() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); - auto last_acc = FLAGS_acc; - FLAGS_acc = 1e-4; - for (int d : TestSizes()) { - auto ref = jit::GetRefer>(); - EXPECT_TRUE(ref != nullptr); - std::vector x(d); - RandomVec(d, x.data(), -2.f, 2.f); - T ref_res; - ref(x.data(), &ref_res, d); - TestAllImpls, PlaceType, std::vector, T>(d, x, - ref_res); + auto verifier = [](const typename KernelTuple::func_type tgt, const T a, + const std::vector& x, const std::vector& yref) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(yref.size(), x.size()); + const T* x_data = x.data(); + const T* yref_data = yref.data(); + const int d = yref.size(); + std::vector ytgt(d); + T* ytgt_data = ytgt.data(); + // test normal + tgt(&a, x_data, ytgt_data, d); + ExpectEQ(ytgt_data, yref_data, d); + // test inplace x + std::copy(x.begin(), x.end(), ytgt.begin()); + tgt(&a, ytgt_data, ytgt_data, d); + ExpectEQ(ytgt_data, yref_data, d); + }; + TestAllImpls(d, verifier, a, x, yref); } - FLAGS_acc = last_acc; } -template -void TestXYNKernel() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); +template +void TestKernelXYN() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); for (int d : TestSizes()) { - auto ref = jit::GetRefer>(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector x(d), yref(d); std::vector xinp(d); // inplace test - RandomVec(d, x.data(), -2.f, 2.f); + RandomVec(d, x.data()); std::copy(x.begin(), x.end(), xinp.begin()); const T* x_data = x.data(); @@ -419,17 +198,61 @@ void TestXYNKernel() { ref(x_data, yref_data, d); ref(xinp_data, xinp_data, d); ExpectEQ(xinp_data, yref_data, d); + auto verifier = [](const typename KernelTuple::func_type tgt, + const std::vector& x, const std::vector& yref) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(yref.size(), x.size()); + const T* x_data = x.data(); + const T* yref_data = yref.data(); + const int d = yref.size(); + std::vector ytgt(d); + T* ytgt_data = ytgt.data(); + // test normal + tgt(x_data, ytgt_data, d); + ExpectEQ(ytgt_data, yref_data, d); + // test inplace x + std::copy(x.begin(), x.end(), ytgt.begin()); + tgt(ytgt_data, ytgt_data, d); + ExpectEQ(ytgt_data, yref_data, d); + }; + TestAllImpls(d, verifier, x, yref); + } +} + +template +void TestKernelXRN() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); + auto last_acc = FLAGS_acc; + FLAGS_acc = 1e-4; + for (int d : TestSizes()) { + auto ref = jit::GetReferFunc(); + EXPECT_TRUE(ref != nullptr); + std::vector x(d); + RandomVec(d, x.data()); + T ref_res; + ref(x.data(), &ref_res, d); - TestAllImpls, PlaceType, std::vector, - std::vector>(d, x, yref); + auto verifier = [](const typename KernelTuple::func_type tgt, + const std::vector& x, const T ref_res) { + EXPECT_TRUE(tgt != nullptr); + T tgt_res; + tgt(x.data(), &tgt_res, x.size()); + ExpectEQ(&tgt_res, &ref_res, 1); + }; + TestAllImpls(d, verifier, x, ref_res); } + FLAGS_acc = last_acc; } -template -void TestLSTMKernel() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); +template +void TestKernelLSTM() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); std::vector all_acts = {"sigmoid", "tanh", "relu", "identity"}; - for (int d : TestSizes()) { + auto test_sizes = TestSizes(); + test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000)); + for (int d : test_sizes) { for (bool use_peephole : {true, false}) { for (auto& act_gate : all_acts) { for (auto& act_cand : all_acts) { @@ -437,11 +260,11 @@ void TestLSTMKernel() { const jit::lstm_attr_t attr( d, jit::to_kerneltype(act_gate), jit::to_kerneltype(act_cand), jit::to_kerneltype(act_cell), use_peephole); - auto ref = jit::GetRefer>(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector xsrc(4 * d), wp(3 * d), ct_1(d); std::vector ct_ref(d), ht_ref(d), checked(2 * d); - RandomVec(4 * d, xsrc.data(), -2.f, 2.f); + RandomVec(4 * d, xsrc.data()); RandomVec(3 * d, wp.data(), -1.f, 1.f); RandomVec(d, ct_1.data(), -1.f, 1.f); // x could be changed after compute, so copy to save src @@ -464,10 +287,51 @@ void TestLSTMKernel() { } ref(&step, &attr); VLOG(10) << attr; - TestAllImpls, PlaceType, std::vector, - std::vector, std::vector, std::vector, - std::vector>(attr, xsrc, wp, ct_1, ct_ref, ht_ref, - attr); + + auto verifier = []( + const typename KernelTuple::func_type tgt, + const std::vector& xsrc, const std::vector& wp, + const std::vector& ct_1, const std::vector& ct_ref, + const std::vector& ht_ref, + const typename KernelTuple::attr_type& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(ct_ref.size(), ht_ref.size()); + EXPECT_EQ(ct_1.size(), ht_ref.size()); + EXPECT_EQ(xsrc.size(), 4 * ht_ref.size()); + EXPECT_EQ(wp.size(), 3 * ht_ref.size()); + + // x could be changed after compute, so copy to save src + int d = ht_ref.size(); + std::vector x(xsrc.size()), ct(ct_ref.size()), + ht(ht_ref.size()); + std::vector checked(2 * d); + std::copy(xsrc.begin(), xsrc.end(), x.begin()); + + const T* ct_1_data = ct_1.data(); + const T* wp_data = wp.data(); + const T* ct_ref_data = ct_ref.data(); + const T* ht_ref_data = ht_ref.data(); + T* x_data = x.data(); + T* ct_data = ct.data(); + T* ht_data = ht.data(); + T* checked_data = checked.data(); + + jit::lstm_t step; + step.gates = x_data; + step.ct_1 = ct_1_data; + step.ct = ct_data; + step.ht = ht_data; + if (attr.use_peephole) { + step.wp = wp_data; + step.checked = checked_data; + } + + tgt(&step, &attr); + ExpectEQ(ct_data, ct_ref_data, d); + ExpectEQ(ht_data, ht_ref_data, d); + }; + TestAllImpls(attr, verifier, xsrc, wp, ct_1, + ct_ref, ht_ref, attr); } } } @@ -475,20 +339,23 @@ void TestLSTMKernel() { } } -template -void TestGRUKernel() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); +template +void TestKernelGRU() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); std::vector all_acts = {"sigmoid", "tanh", "relu", "identity"}; - for (int d : TestSizes()) { + auto test_sizes = TestSizes(); + test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000)); + for (int d : test_sizes) { for (auto& act_gate : all_acts) { for (auto& act_cand : all_acts) { const jit::gru_attr_t attr(d, jit::to_kerneltype(act_gate), jit::to_kerneltype(act_cand)); - auto ref = jit::GetRefer>(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector xsrc(3 * d), ht_1(d), ht_ref(d); - RandomVec(3 * d, xsrc.data(), -2.f, 2.f); - RandomVec(d, ht_1.data(), -2.f, 2.f); + RandomVec(3 * d, xsrc.data()); + RandomVec(d, ht_1.data()); // x could be changed after compute, so copy to save src std::vector x(xsrc.size()); std::copy(xsrc.begin(), xsrc.end(), x.begin()); @@ -501,76 +368,365 @@ void TestGRUKernel() { step.ht = ht_ref_data; ref(&step, &attr); VLOG(10) << attr; - TestAllImpls, PlaceType, std::vector, - std::vector, std::vector>(attr, xsrc, ht_1, ht_ref, - attr); + auto verifier = [](const typename KernelTuple::func_type tgt, + const std::vector& xsrc, + const std::vector& ht_1, + const std::vector& ht_ref, + const typename KernelTuple::attr_type& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(ht_1.size(), ht_ref.size()); + EXPECT_EQ(xsrc.size(), 3 * ht_ref.size()); + + // x could be changed after compute, so copy to save src + int d = ht_ref.size(); + std::vector x(xsrc.size()), ht(ht_ref.size()); + std::copy(xsrc.begin(), xsrc.end(), x.begin()); + const T* ht_1_data = ht_1.data(); + const T* ht_ref_data = ht_ref.data(); + T* x_data = x.data(); + T* ht_data = ht.data(); + jit::gru_t step; + step.gates = x_data; + step.ht_1 = ht_1_data; + step.ht = ht_data; + tgt(&step, &attr); + ExpectEQ(ht_data, ht_ref_data, d); + }; + TestAllImpls(attr, verifier, xsrc, ht_1, ht_ref, + attr); + } + } + } +} + +template +void TestKernelNCHW16CMulNC() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); + const int n = 3, c = 16 * 4, h = 10, w = 10; + auto ref = jit::GetReferFunc(); + EXPECT_TRUE(ref != nullptr); + int sz = n * c * h * w; + std::vector x(sz), y(n * c), zref(sz); + std::vector ztgt(sz), zjit(sz); + RandomVec(sz, x.data()); + RandomVec(n * c, y.data()); + + const T* x_data = x.data(); + const T* y_data = y.data(); + T* zref_data = zref.data(); + T* ztgt_data = ztgt.data(); + T* zjit_data = zjit.data(); + constexpr int simd_width = ZMM_FLOAT_BLOCK; + int C = c / simd_width; + auto tgt = jit::KernelFuncs::Cache().At(0); + auto funcs = jit::GetAllCandidateFuncs(0); + EXPECT_GT(funcs.size(), 0UL); + auto jitcode = funcs[0]; + EXPECT_TRUE(tgt != nullptr); + + if (std::is_same::value && + paddle::platform::MayIUse(paddle::platform::avx512f)) { + EXPECT_TRUE(jitcode != nullptr); + } + for (int ni = 0; ni < n; ni++) { + for (int ci = 0; ci < C; ci++) { + auto ptr_x = + x_data + ni * C * h * w * simd_width + ci * h * w * simd_width; + auto ptr_y = y_data + ni * C * simd_width + ci * simd_width; + auto ptr_zref = + zref_data + ni * C * h * w * simd_width + ci * h * w * simd_width; + auto ptr_ztgt = + ztgt_data + ni * C * h * w * simd_width + ci * h * w * simd_width; + + ref(ptr_x, ptr_y, ptr_zref, h, w); + tgt(ptr_x, ptr_y, ptr_ztgt, h, w); + + if (jitcode) { + auto ptr_zjit = + zjit_data + ni * C * h * w * simd_width + ci * h * w * simd_width; + jitcode(ptr_x, ptr_y, ptr_zjit, h, w); } } } + ExpectEQ(ztgt_data, zref_data, sz); + if (jitcode) { + ExpectEQ(zjit_data, zref_data, sz); + } +} + +template +void TestKernelLayerNorm() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); + const T epsilon = 9.99999975e-06; + for (int n : {1, 2, 10}) { + for (int x_dim_0 : {1, 9, 17, 50}) { + int left = n * x_dim_0; + for (int x_dim_1 : TestSizes()) { + int right = x_dim_1; + auto ref = jit::GetReferFunc(); + EXPECT_TRUE(ref != nullptr); + int sz = left * right; + std::vector x(sz), mean(left), var(left), scale(right), bias(right), + outref(sz); + RandomVec(sz, x.data()); + RandomVec(left, mean.data()); + RandomVec(left, var.data()); + RandomVec(right, scale.data()); + RandomVec(right, bias.data()); + + const T* scale_data = scale.data(); + const T* bias_data = bias.data(); + T* x_data = x.data(); + T* mean_data = mean.data(); + T* var_data = var.data(); + T* outref_data = outref.data(); + + ref(x_data, outref_data, mean_data, var_data, scale_data, bias_data, + left, epsilon, right); + + auto verifier = []( + const typename KernelTuple::func_type tgt, const std::vector& x_, + const std::vector& outref_, const std::vector& mean_, + const std::vector& var_, const std::vector& scale, + const std::vector& bias, const int& left, const float& epsilon, + const typename KernelTuple::attr_type& right) { + EXPECT_TRUE(tgt != nullptr); + std::vector outtgt(outref_.size()); + std::vector x(x_.size()); + std::vector mean(mean_.size()); + std::vector var(var_.size()); + std::vector outref(outref_.size()); + std::copy(x_.begin(), x_.end(), x.begin()); + std::copy(mean_.begin(), mean_.end(), mean.begin()); + std::copy(var_.begin(), var_.end(), var.begin()); + std::copy(outref_.begin(), outref_.end(), outref.begin()); + + EXPECT_EQ(x.size(), static_cast(left * right)); + EXPECT_EQ(outref.size(), static_cast(left * right)); + EXPECT_EQ(mean.size(), static_cast(left)); + EXPECT_EQ(var.size(), static_cast(left)); + EXPECT_EQ(scale.size(), static_cast(right)); + EXPECT_EQ(bias.size(), static_cast(right)); + + const T* scale_data = scale.data(); + const T* bias_data = bias.data(); + T* x_data = x.data(); + T* mean_data = mean.data(); + T* var_data = var.data(); + T* outref_data = outref.data(); + T* outtgt_data = outtgt.data(); + tgt(x_data, outtgt_data, mean_data, var_data, scale_data, bias_data, + left, epsilon, right); + ExpectEQ(outtgt_data, outref_data, left * right); + }; + TestAllImpls(right, verifier, x, outref, mean, + var, scale, bias, left, epsilon, + right); + } + } + } +} + +template +void TestKernelCRFDecoding() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); + constexpr int state_trans_base_idx = 2; + auto test_sizes = TestSizes(); + test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 2000)); + for (int seq_len : {1, 11, 17, 50}) { + for (int tag_num : test_sizes) { + auto ref = jit::GetReferFunc(); + EXPECT_TRUE(ref != nullptr); + int x_sz = seq_len * tag_num; + int w_sz = (tag_num + state_trans_base_idx) * tag_num; + std::vector x(x_sz), w(w_sz), alpharef(x_sz); + std::vector trackref(x_sz); + RandomVec(x_sz, x.data()); + RandomVec(w_sz, w.data()); + + ref(seq_len, (const T*)x.data(), (const T*)w.data(), alpharef.data(), + trackref.data(), tag_num); + + auto verifier = []( + const typename KernelTuple::func_type tgt, const int& seq_len, + const std::vector& x, const std::vector& w, + const std::vector& alpharef, const std::vector& trackref, + const typename KernelTuple::attr_type& tag_num) { + constexpr int state_trans_base_idx = 2; + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(x.size(), static_cast(seq_len * tag_num)); + EXPECT_EQ(w.size(), static_cast( + (tag_num + state_trans_base_idx) * tag_num)); + EXPECT_EQ(alpharef.size(), static_cast(seq_len * tag_num)); + EXPECT_EQ(trackref.size(), static_cast(seq_len * tag_num)); + std::vector alphatgt(alpharef.size()); + std::vector tracktgt(trackref.size()); + memcpy(tracktgt.data(), trackref.data(), tag_num * sizeof(int)); + tgt(seq_len, (const T*)x.data(), (const T*)w.data(), alphatgt.data(), + tracktgt.data(), tag_num); + ExpectEQ(alpharef.data(), alphatgt.data(), seq_len * tag_num); + ExpectEQ(trackref.data(), tracktgt.data(), seq_len * tag_num); + }; + TestAllImpls(tag_num, verifier, seq_len, x, w, + alpharef, trackref, tag_num); + } + } } -template -void TestSeqPoolKernel() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); +template +void TestKernelSeqPool() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); std::vector pool_types = { jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt}; + auto test_sizes = TestSizes(); + test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000)); for (auto type : pool_types) { - for (int w : TestSizes()) { + for (int w : test_sizes) { jit::seq_pool_attr_t attr(w, type); - for (int h : TestSizes()) { + for (int h : test_sizes) { attr.h = h; - auto ref = jit::GetRefer>(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector x(h * w), yref(w); - RandomVec(h * w, x.data(), -2.f, 2.f); + RandomVec(h * w, x.data()); const T* x_data = x.data(); T* yref_data = yref.data(); ref(x_data, yref_data, &attr); VLOG(10) << attr; - TestAllImpls, PlaceType, std::vector, - std::vector>(attr, x, yref, attr); + auto verifier = [](const typename KernelTuple::func_type tgt, + const std::vector& x, const std::vector& yref, + const typename KernelTuple::attr_type& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(x.size() % yref.size(), static_cast(0)); + int w = yref.size(); + std::vector y(w); + const T* x_data = x.data(); + const T* yref_data = yref.data(); + T* y_data = y.data(); + tgt(x_data, y_data, &attr); + ExpectEQ(y_data, yref_data, w); + }; + TestAllImpls(attr, verifier, x, yref, attr); } } } } -template -void TestMatMulKernel() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); +template +void TestKernelEmbSeqPool() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); + int64_t tbl_h = 1e4; + std::vector pool_types = { + jit::SeqPoolType::kSum}; // only support sum yet + auto test_sizes = TestSizes(); + test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000)); + for (int tbl_w : test_sizes) { + std::vector table(tbl_h * tbl_w); + RandomVec(tbl_h * tbl_w, table.data()); + const T* table_data = table.data(); + for (auto type : pool_types) { + for (int idx_w : {1, 2, 10, 16}) { + for (int idx_h : {1, 2, 9, 13, 16}) { + auto ref = jit::GetReferFunc(); + EXPECT_TRUE(ref != nullptr); + std::vector idx(idx_h * idx_w); + RandomVec(idx_h * idx_w, idx.data(), 0, tbl_h - 1); + int64_t out_w = tbl_w * idx_w; + std::vector oref(out_w); + const int64_t* idx_data = idx.data(); + T* o_data = oref.data(); + jit::emb_seq_pool_attr_t attr(tbl_h, tbl_w, idx_h, idx_w, out_w, + type); + ref(table_data, idx_data, o_data, &attr); + + auto verifier = [](const typename KernelTuple::func_type tgt, + const std::vector& table, + const std::vector& idx, + const std::vector& oref, + const typename KernelTuple::attr_type& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(table.size(), static_cast(attr.table_height * + attr.table_width)); + EXPECT_EQ(idx.size(), static_cast(attr.index_height * + attr.index_width)); + EXPECT_EQ(oref.size(), + static_cast(attr.table_width * attr.index_width)); + const T* table_data = table.data(); + const int64_t* idx_data = idx.data(); + const T* oref_data = oref.data(); + int o_w = oref.size(); + std::vector out(o_w); + T* o_data = out.data(); + tgt(table_data, idx_data, o_data, &attr); + ExpectEQ(o_data, oref_data, o_w); + }; + TestAllImpls(attr, verifier, table, idx, oref, + attr); + } + } + } + } +} + +template +void TestKernelMatMul() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); auto last_acc = FLAGS_acc; - // TODO(intel): fix MKL acc issue - // https://github.com/PaddlePaddle/Paddle/issues/15447 + // export MKL_CBWR=AVX would make MKL force to use AVX + // export KMP_DETERMINISTIC_REDUCTION=yes would make the result deterministic FLAGS_acc = 1e-3; for (int m : {1, 2, 3, 4}) { for (int n : {1, 2, 3, 4}) { for (int k : TestSizes()) { - auto ref = jit::GetRefer>(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector a(m * k), b(k * n), c(m * n); - RandomVec(m * k, a.data(), -2.f, 2.f); - RandomVec(k * n, b.data(), -2.f, 2.f); + RandomVec(m * k, a.data()); + RandomVec(k * n, b.data()); const T* a_data = a.data(); const T* b_data = b.data(); T* c_data = c.data(); const jit::matmul_attr_t attr{m, n, k}; ref(a_data, b_data, c_data, &attr); - TestAllImpls, PlaceType, std::vector, - std::vector, std::vector>(attr, a, b, c, attr); + auto verifier = [](const typename KernelTuple::func_type tgt, + const std::vector& a, const std::vector& b, + const std::vector& cref, + const typename KernelTuple::attr_type& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(a.size(), static_cast(attr.m * attr.k)); + EXPECT_EQ(b.size(), static_cast(attr.k * attr.n)); + EXPECT_EQ(cref.size(), static_cast(attr.m * attr.n)); + std::vector c(cref.size()); + const T* a_data = a.data(); + const T* b_data = b.data(); + const T* cref_data = cref.data(); + T* c_data = c.data(); + tgt(a_data, b_data, c_data, &attr); + ExpectEQ(c_data, cref_data, attr.m * attr.n); + }; + TestAllImpls(attr, verifier, a, b, c, attr); } } } FLAGS_acc = last_acc; } -template -void TestSoftmaxKernel() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); +template +void TestKernelSoftmax() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); for (int bs : {1, 2, 10}) { for (int n : TestSizes()) { - auto ref = jit::GetRefer>(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector x(bs * n), y(bs * n); - RandomVec(bs * n, x.data(), -2.f, 2.f); + RandomVec(bs * n, x.data()); const T* x_data = x.data(); T* y_data = y.data(); @@ -581,188 +737,558 @@ void TestSoftmaxKernel() { ref(xinp_data, xinp_data, n, bs); ExpectEQ(xinp_data, y_data, n * bs); - TestAllImpls, PlaceType, std::vector, - std::vector>(n, x, y, n, bs); + auto verifier = [](const typename KernelTuple::func_type tgt, + const std::vector& x, const std::vector& yref, + int n, int bs) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(yref.size(), x.size()); + EXPECT_EQ(x.size(), static_cast(n * bs)); + const T* x_data = x.data(); + const T* yref_data = yref.data(); + std::vector ytgt(n * bs); + T* ytgt_data = ytgt.data(); + // test normal + tgt(x_data, ytgt_data, n, bs); + ExpectEQ(ytgt_data, yref_data, n * bs); + // test inplace x + std::copy(x.begin(), x.end(), ytgt.begin()); + tgt(ytgt_data, ytgt_data, n, bs); + ExpectEQ(ytgt_data, yref_data, n * bs); + }; + TestAllImpls(n, verifier, x, y, n, bs); } } } -template -void TestNCHW16CMulNCKernel() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); - const int n = 3, c = 16 * 4, h = 10, w = 10; - auto ref = jit::GetRefer>(); - EXPECT_TRUE(ref != nullptr); - int sz = n * c * h * w; - std::vector x(sz), y(n * c), zref(sz); - std::vector ztgt(sz), zjit(sz); - RandomVec(sz, x.data(), -2.f, 2.f); - RandomVec(n * c, y.data(), -2.f, 2.f); - - const T* x_data = x.data(); - const T* y_data = y.data(); - T* zref_data = zref.data(); - T* ztgt_data = ztgt.data(); - T* zjit_data = zjit.data(); - constexpr int simd_width = ZMM_FLOAT_BLOCK; - int C = c / simd_width; - auto tgt = jit::Get, PlaceType>(0); - auto jitcode = jit::GetJitCode, PlaceType>(0); - EXPECT_TRUE(tgt != nullptr); - - if (std::is_same::value && - paddle::platform::MayIUse(paddle::platform::avx512f)) { - EXPECT_TRUE(jitcode != nullptr); - } - for (int ni = 0; ni < n; ni++) { - for (int ci = 0; ci < C; ci++) { - auto ptr_x = - x_data + ni * C * h * w * simd_width + ci * h * w * simd_width; - auto ptr_y = y_data + ni * C * simd_width + ci * simd_width; - auto ptr_zref = - zref_data + ni * C * h * w * simd_width + ci * h * w * simd_width; - auto ptr_ztgt = - ztgt_data + ni * C * h * w * simd_width + ci * h * w * simd_width; +template +void TestKernelSgd() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); + const T lr = 0.1; + auto UnDuplicatedRandomVec = [](int n, const int64_t lower, + const int64_t upper) -> std::vector { + PADDLE_ENFORCE_LE(static_cast(upper - lower), n - 1); + PADDLE_ENFORCE_GT(n, 0); + std::vector all, out; + for (int i = 0; i < n; ++i) { + all.push_back(i); + } + std::random_shuffle(all.begin(), all.end()); + out.insert(out.begin(), all.begin(), all.begin() + n); + return out; + }; + for (int param_h : {1, 10}) { + for (int grad_w : TestSizes()) { + std::vector param(param_h * grad_w); + std::vector param_out(param_h * grad_w); + RandomVec(param_h * grad_w, param.data()); + const T* param_data = param.data(); + T* out_data = param_out.data(); + for (int rows_size = 1; rows_size <= param_h; ++rows_size) { + std::vector grad(rows_size * grad_w); + std::vector rows = + UnDuplicatedRandomVec(rows_size, 0, rows_size - 1); + RandomVec(rows_size * grad_w, grad.data()); + const int64_t* rows_data = rows.data(); + const T* grad_data = grad.data(); + auto ref = jit::GetReferFunc(); + EXPECT_TRUE(ref != nullptr); + jit::sgd_attr_t attr(param_h, grad_w, rows_size, grad_w, rows_size); + ref(&lr, param_data, grad_data, rows_data, out_data, &attr); + + // inplace test + std::vector inp(param.size()); + std::copy(param.begin(), param.end(), inp.begin()); + T* inp_data = inp.data(); + ref(&lr, inp_data, grad_data, rows_data, inp_data, &attr); + // only the selected rows should be equal + for (int i = 0; i < rows_size; ++i) { + ExpectEQ(inp_data + rows[i] * grad_w, out_data + rows[i] * grad_w, + grad_w); + } - ref(ptr_x, ptr_y, ptr_zref, h, w); - tgt(ptr_x, ptr_y, ptr_ztgt, h, w); + auto verifier = []( + const typename KernelTuple::func_type tgt, const T lr, + const std::vector& param, const std::vector& grad, + const std::vector& rows, const std::vector& oref, + const typename KernelTuple::attr_type& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(param.size(), + static_cast(attr.param_height * attr.param_width)); + EXPECT_EQ(grad.size(), + static_cast(attr.grad_height * attr.grad_width)); + EXPECT_EQ(rows.size(), static_cast(attr.selected_rows_size)); + EXPECT_EQ(param.size(), oref.size()); + const T* param_data = param.data(); + const T* grad_data = grad.data(); + const int64_t* rows_data = rows.data(); + const T* oref_data = oref.data(); + + std::vector out(oref.size()); + T* o_data = out.data(); + tgt(&lr, param_data, grad_data, rows_data, o_data, &attr); + // only the selected rows should be equal + for (size_t i = 0; i < rows.size(); ++i) { + ExpectEQ(o_data + rows[i] * attr.grad_width, + oref_data + rows[i] * attr.grad_width, attr.grad_width); + } - if (jitcode) { - auto ptr_zjit = - zjit_data + ni * C * h * w * simd_width + ci * h * w * simd_width; - jitcode(ptr_x, ptr_y, ptr_zjit, h, w); + // inplace + std::copy(param.begin(), param.end(), out.begin()); + tgt(&lr, o_data, grad_data, rows_data, o_data, &attr); + for (size_t i = 0; i < rows.size(); ++i) { + ExpectEQ(o_data + rows[i] * attr.grad_width, + oref_data + rows[i] * attr.grad_width, attr.grad_width); + } + }; + TestAllImpls(attr, verifier, lr, param, grad, + rows, param_out, attr); } } } - ExpectEQ(ztgt_data, zref_data, sz); - if (jitcode) { - ExpectEQ(zjit_data, zref_data, sz); - } -} - -// XYZNTuple -TEST(JITKernel, kVMul) { - TestXYZNKernel(); - TestXYZNKernel(); -} - -TEST(JITKernel, kVAdd) { - TestXYZNKernel(); - TestXYZNKernel(); } -TEST(JITKernel, kVAddRelu) { - TestXYZNKernel(); - TestXYZNKernel(); -} - -TEST(JITKernel, kVSub) { - TestXYZNKernel(); - TestXYZNKernel(); +template +void TestKernelVBroadcast() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); + for (int w : TestSizes()) { + std::vector x(w); + RandomVec(w, x.data()); + const T* x_data = x.data(); + for (int64_t h : {1, 2, 6}) { + auto ref = jit::GetReferFunc(); + EXPECT_TRUE(ref != nullptr); + std::vector y(w * h); + T* y_data = y.data(); + ref(x_data, y_data, h, w); + + auto verifier = [](const typename KernelTuple::func_type tgt, + const std::vector& x, const std::vector& yref, + const int64_t& h, + const typename KernelTuple::attr_type& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(x.size(), static_cast(attr)); + EXPECT_EQ(yref.size(), x.size() * h); + std::vector y(yref.size()); + const T* x_data = x.data(); + const T* yref_data = yref.data(); + T* y_data = y.data(); + tgt(x_data, y_data, h, attr); + ExpectEQ(y_data, yref_data, yref.size()); + }; + TestAllImpls(static_cast(w), verifier, x, + y, h, static_cast(w)); + } + } } -// AXYNTuples -TEST(JITKernel, kVScal) { - TestAXYNKernel(); - TestAXYNKernel(); +// test pool +TEST(JITKernel_pool, jitcreator) { + const auto& jitcreators = jit::JitCodeCreatorPool::Instance().AllCreators(); +#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) + EXPECT_EQ(jitcreators.size(), 0UL); +#else + EXPECT_EQ(jitcreators.size(), 25UL); +#endif } -TEST(JITKernel, kVAddBias) { - TestAXYNKernel(); - TestAXYNKernel(); +TEST(JITKernel_pool, jitpool) { + // jitpool is related with attr + const auto& kers = jit::JitCodePool().Instance().AllKernels(); + EXPECT_EQ(kers.size(), 0UL); + jit::GetAllCandidateKernels, CPUPlace>(3); +// after call GetAllCandidateKernels, it will create jitcode Automatically +#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) + EXPECT_EQ(kers.size(), 0UL); +#else + EXPECT_EQ(kers.size(), 1UL); +#endif } -// XRNTuples -TEST(JITKernel, kHMax) { - TestXRNKernel(); - TestXRNKernel(); +TEST(JITKernel_pool, more) { + const auto& kers = jit::KernelPool::Instance().AllKernels(); +#if defined(__APPLE__) || defined(__OSX__) + EXPECT_EQ(kers.size(), 10UL); +#else +#ifdef PADDLE_WITH_MKLML + EXPECT_EQ(kers.size(), 21UL); +#else + EXPECT_EQ(kers.size(), 8UL); +#endif +#endif } -TEST(JITKernel, kHSum) { - TestXRNKernel(); - TestXRNKernel(); +TEST(JITKernel_pool, refer) { + const auto& kers = jit::ReferKernelPool::Instance().AllKernels(); + EXPECT_EQ(kers.size(), 29UL); } -// XYNTuples -TEST(JITKernel, kVRelu) { - TestXYNKernel(); - TestXYNKernel(); +// test helper +TEST(JITKernel_helper, GetAllCandidateKernels) { + auto fp_kers = + jit::GetAllCandidateKernels, CPUPlace>(10); +#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) + EXPECT_GE(fp_kers.size(), 1UL); // refer +#else +#ifdef PADDLE_WITH_MKLML + EXPECT_GE(fp_kers.size(), 3UL); // jitcode, mkl, refer +#else + EXPECT_GE(fp_kers.size(), 2UL); // jitcode, refer +#endif +#endif + + auto db_kers = + jit::GetAllCandidateKernels, CPUPlace>(10); +#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) + EXPECT_GE(db_kers.size(), 1UL); // refer +#else +#ifdef PADDLE_WITH_MKLML + EXPECT_GE(db_kers.size(), 2UL); // mkl, refer +#else + EXPECT_GE(db_kers.size(), 1UL); // refer +#endif +#endif } -TEST(JITKernel, kVIdentity) { - TestXYNKernel(); - TestXYNKernel(); +TEST(JITKernel_helper, GetAllCandidateFuncsWithTypes) { + auto fp_kers = + jit::GetAllCandidateFuncsWithTypes, CPUPlace>(10); +#if defined(__APPLE__) || defined(__OSX__) + EXPECT_GE(fp_kers.size(), 1UL); // refer +#else +#if !defined(PADDLE_WITH_MKLML) || defined(_WIN32) + EXPECT_GE(fp_kers.size(), 2UL); // jitcode/mkl, refer +#else + EXPECT_GE(fp_kers.size(), 3UL); // jitcode, mkl, refer +#endif +#endif + + auto db_kers = + jit::GetAllCandidateFuncsWithTypes, CPUPlace>(10); +#if defined(__APPLE__) || defined(__OSX__) || !defined(PADDLE_WITH_MKLML) + EXPECT_GE(db_kers.size(), 1UL); // refer +#else + EXPECT_GE(db_kers.size(), 2UL); // mkl, refer +#endif } -TEST(JITKernel, kVSquare) { - TestXYNKernel(); - TestXYNKernel(); +TEST(JITKernel_helper, KernelFuncs) { + auto f1 = jit::KernelFuncs, CPUPlace>::Cache().At(3); + auto f2 = jit::KernelFuncs, CPUPlace>::Cache()[3]; + EXPECT_TRUE(f1 != nullptr); + EXPECT_TRUE(f1 == f2); + + auto f3 = jit::KernelFuncs, CPUPlace>::Cache()[5]; +#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) + EXPECT_TRUE(f2 == f3); +#else + EXPECT_TRUE(f2 != f3); +#endif } -TEST(JITKernel, kVExp) { - TestXYNKernel(); - TestXYNKernel(); +TEST(JITKernel_helper, GetAllCandidateFuncs) { + auto funcs = jit::GetAllCandidateFuncs, CPUPlace>(10); + auto kers = jit::GetAllCandidateKernels, CPUPlace>(10); + EXPECT_EQ(funcs.size(), kers.size()); + + std::vector x(10), tgt(10); + RandomVec(10, x.data()); + auto best = jit::GetDefaultBestFunc, CPUPlace>(10); + best(x.data(), tgt.data(), 10); + for (auto f : funcs) { + std::vector y(10); + f(x.data(), y.data(), 10); + ExpectEQ(y.data(), tgt.data(), 10); + } } -TEST(JITKernel, kVSigmoid) { - TestXYNKernel(); - TestXYNKernel(); -} +TEST(JITKernel_helper, pack_weights) { + const int N = 8 * 60, K = 2; + float src[K][N], yref[K][N], y[K * N]; + float* x = &(src[0][0]); + float* ref = &(yref[0][0]); + for (int i = 0; i < N * K; ++i) { + *(x + i) = static_cast(i); + } + int block = 0; + std::vector groups; + if (paddle::platform::MayIUse(paddle::platform::avx512f)) { + block = ZMM_FLOAT_BLOCK; + groups.push_back(30); + } else { + block = YMM_FLOAT_BLOCK; + groups.insert(groups.end(), {14, 14, 14, 14, 4}); + } -TEST(JITKernel, kVTanh) { - TestXYNKernel(); - TestXYNKernel(); -} + int offset = 0; + int acc = 0; + for (int g : groups) { + g = g * block; + for (int k = 0; k < K; ++k) { + for (int i = 0; i < g; ++i) { + *(ref + offset) = src[k][i + acc]; + offset++; + } + } + acc += g; + } -// LSTM -TEST(JITKernel, kLSTMCtHt) { - TestLSTMKernel(); - TestLSTMKernel(); + jit::pack_weights(x, y, N, K); + ExpectEQ(y, ref, N * K); } -TEST(JITKernel, kLSTMC1H1) { - TestLSTMKernel(); - TestLSTMKernel(); +TEST(JITKernel_helper, attr) { + std::ostringstream out; + // KernelTypes + out << jit::to_string(jit::kNone) << jit::to_string(jit::kCRFDecoding) + << jit::to_string(jit::kEmbSeqPool) << jit::to_string(jit::kGRUH1) + << jit::to_string(jit::kGRUHtPart1) << jit::to_string(jit::kGRUHtPart2) + << jit::to_string(jit::kHSum) << jit::to_string(jit::kHMax) + << jit::to_string(jit::kLSTMCtHt) << jit::to_string(jit::kLSTMC1H1) + << jit::to_string(jit::kLayerNorm) << jit::to_string(jit::kMatMul) + << jit::to_string(jit::kNCHW16CMulNC) << jit::to_string(jit::kSeqPool) + << jit::to_string(jit::kSoftmax) << jit::to_string(jit::kVAdd) + << jit::to_string(jit::kVAddBias) << jit::to_string(jit::kVAddRelu) + << jit::to_string(jit::kVBroadcast) << jit::to_string(jit::kVCopy) + << jit::to_string(jit::kVExp) << jit::to_string(jit::kVIdentity) + << jit::to_string(jit::kVMul) << jit::to_string(jit::kVRelu) + << jit::to_string(jit::kVScal) << jit::to_string(jit::kSgd) + << jit::to_string(jit::kVSigmoid) << jit::to_string(jit::kVSquare) + << jit::to_string(jit::kVSub) << jit::to_string(jit::kVTanh); + EXPECT_EQ(out.str().size(), 234); + + // SeqPoolTypes + out.str(""); + out << jit::to_string(jit::kSum) << jit::to_string(jit::kAvg) + << jit::to_string(jit::kSqrt); + EXPECT_EQ(out.str().size(), 13); + + EXPECT_EQ(jit::to_kerneltype("relu"), jit::kVRelu); + EXPECT_EQ(jit::to_kerneltype("Identity"), jit::kVIdentity); + EXPECT_EQ(jit::to_kerneltype("VEXP"), jit::kVExp); + EXPECT_EQ(jit::to_kerneltype("SigmoiD"), jit::kVSigmoid); + EXPECT_EQ(jit::to_kerneltype("VTanh"), jit::kVTanh); + + out.str(""); + out << jit::lstm_attr_t(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); + EXPECT_EQ(out.str().size(), 89); + + out.str(""); + out << jit::gru_attr_t(8, jit::kVIdentity, jit::kVSigmoid); + EXPECT_EQ(out.str().size(), 52); + + out.str(""); + out << jit::seq_pool_attr_t(8, jit::SeqPoolType::kSum); + EXPECT_EQ(out.str().size(), 44); + + out.str(""); + out << jit::emb_seq_pool_attr_t(1, 2, 3, 4, 5, jit::SeqPoolType::kAvg); + EXPECT_EQ(out.str().size(), 93); + + out.str(""); + out << jit::sgd_attr_t(1, 2, 3, 4, 5); + EXPECT_EQ(out.str().size(), 81); + + out.str(""); + out << jit::matmul_attr_t(1, 2, 3); + EXPECT_EQ(out.str().size(), 14); } -// GRU -TEST(JITKernel, kGRUH1) { - TestGRUKernel(); - TestGRUKernel(); +// test keys +TEST(JITKernel_key, int) { + EXPECT_TRUE(jit::JitCodeKey(2) == jit::JitCodeKey(2)); + EXPECT_TRUE(jit::JitCodeKey(2) == jit::JitCodeKey(2)); + EXPECT_TRUE(jit::JitCodeKey(2) != jit::JitCodeKey(3)); } -TEST(JITKernel, kGRUHtPart1) { - TestGRUKernel(); - TestGRUKernel(); +TEST(JITKernel_key, gru) { + jit::gru_attr_t attr1(8, jit::kVSigmoid, jit::kVTanh); + jit::gru_attr_t attr2(8, jit::kVSigmoid, jit::kVTanh); + jit::gru_attr_t attr3(9, jit::kVSigmoid, jit::kVTanh); + jit::gru_attr_t attr4(9, jit::kVSigmoid, jit::kVIdentity); + jit::gru_attr_t attr5(9, jit::kVTanh, jit::kVIdentity); + + auto key1 = jit::JitCodeKey(attr1); + auto key2 = jit::JitCodeKey(attr2); + auto key3 = jit::JitCodeKey(attr3); + auto key4 = jit::JitCodeKey(attr4); + auto key5 = jit::JitCodeKey(attr5); + + EXPECT_TRUE(key1 == key2); + EXPECT_TRUE(key2 != key3); + EXPECT_TRUE(key2 != key4); + EXPECT_TRUE(key2 != key5); + EXPECT_TRUE(key3 != key4); + EXPECT_TRUE(key3 != key5); + EXPECT_TRUE(key4 != key5); } -TEST(JITKernel, kGRUHtPart2) { - TestGRUKernel(); - TestGRUKernel(); +TEST(JITKernel_key, lstm) { + jit::lstm_attr_t attr1(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); + jit::lstm_attr_t attr2(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); + jit::lstm_attr_t attr3(9, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); + jit::lstm_attr_t attr4(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh); + jit::lstm_attr_t attr5(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh, true); + jit::lstm_attr_t attr6(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh, true); + + auto key1 = jit::JitCodeKey(attr1); + auto key2 = jit::JitCodeKey(attr2); + auto key3 = jit::JitCodeKey(attr3); + auto key4 = jit::JitCodeKey(attr4); + auto key5 = jit::JitCodeKey(attr5); + auto key6 = jit::JitCodeKey(attr6); + + EXPECT_TRUE(key1 == key2); + EXPECT_TRUE(key2 != key3); + EXPECT_TRUE(key2 != key4); + EXPECT_TRUE(key2 != key5); + EXPECT_TRUE(key3 != key4); + EXPECT_TRUE(key3 != key5); + EXPECT_TRUE(key4 != key5); + EXPECT_TRUE(key5 == key6); } -TEST(JITKernel, kSeqPool) { - TestSeqPoolKernel(); - TestSeqPoolKernel(); +TEST(JITKernel_key, seq_pool) { + jit::seq_pool_attr_t attr1(2, jit::SeqPoolType::kSum, 1); + jit::seq_pool_attr_t attr2(2, jit::SeqPoolType::kSum, 3); + jit::seq_pool_attr_t attr3(3, jit::SeqPoolType::kSum, 3); + jit::seq_pool_attr_t attr4(3, jit::SeqPoolType::kAvg, 3); + + auto key1 = jit::JitCodeKey(attr1); + auto key2 = jit::JitCodeKey(attr2); + auto key3 = jit::JitCodeKey(attr3); + auto key4 = jit::JitCodeKey(attr4); + + EXPECT_TRUE(key1 == key2); + EXPECT_TRUE(key2 != key3); + EXPECT_TRUE(key2 != key4); + EXPECT_TRUE(key3 != key4); } -TEST(JITKernel, kMatMul) { - TestMatMulKernel(); - TestMatMulKernel(); +TEST(JITKernel_key, matmul) { + jit::matmul_attr_t attr1(1, 2, 3); + jit::matmul_attr_t attr2(1, 2, 3); + jit::matmul_attr_t attr3(1, 3, 3); + jit::matmul_attr_t attr4(2, 3, 4); + + auto key1 = jit::JitCodeKey(attr1); + auto key2 = jit::JitCodeKey(attr2); + auto key3 = jit::JitCodeKey(attr3); + auto key4 = jit::JitCodeKey(attr4); + + EXPECT_TRUE(key1 == key2); + EXPECT_TRUE(key2 != key3); + EXPECT_TRUE(key2 != key4); + EXPECT_TRUE(key3 != key4); } -TEST(JITKernel, kSoftmax) { - TestSoftmaxKernel(); - TestSoftmaxKernel(); +TEST(JITKernel_key, emb_seq_pool) { + jit::emb_seq_pool_attr_t attr1(1, 2, 3, 4, 5, jit::SeqPoolType::kSum); + jit::emb_seq_pool_attr_t attr2(1, 2, 3, 4, 5, jit::SeqPoolType::kSum); + jit::emb_seq_pool_attr_t attr3(10, 2, 9, 8, 7, jit::SeqPoolType::kAvg); + jit::emb_seq_pool_attr_t attr4(10, 3, 9, 8, 7, jit::SeqPoolType::kSum); + jit::emb_seq_pool_attr_t attr5(1, 6, 3, 4, 5, jit::SeqPoolType::kSum); + + auto key1 = jit::JitCodeKey(attr1); + auto key2 = jit::JitCodeKey(attr2); + auto key3 = jit::JitCodeKey(attr3); + auto key4 = jit::JitCodeKey(attr4); + auto key5 = jit::JitCodeKey(attr5); + + EXPECT_TRUE(key1 == key2); + EXPECT_TRUE(key2 == key3); + EXPECT_TRUE(key2 != key4); + EXPECT_TRUE(key2 != key5); + EXPECT_TRUE(key4 != key5); } -TEST(JITKernel, kNCHW16CMulNC) { - TestNCHW16CMulNCKernel(); - TestNCHW16CMulNCKernel(); +TEST(JITKernel_key, sgd) { + jit::sgd_attr_t attr1(1, 2, 3, 4, 5); + jit::sgd_attr_t attr2(1, 2, 3, 4, 5); + jit::sgd_attr_t attr3(9, 8, 7, 4, 6); + jit::sgd_attr_t attr4(1, 2, 3, 6, 5); + jit::sgd_attr_t attr5(10, 9, 8, 7, 6); + + auto key1 = jit::JitCodeKey(attr1); + auto key2 = jit::JitCodeKey(attr2); + auto key3 = jit::JitCodeKey(attr3); + auto key4 = jit::JitCodeKey(attr4); + auto key5 = jit::JitCodeKey(attr5); + + EXPECT_TRUE(key1 == key2); + EXPECT_TRUE(key2 == key3); + EXPECT_TRUE(key3 != key4); + EXPECT_TRUE(key3 != key5); + EXPECT_TRUE(key4 != key5); } -// TODO(yihua/TJ): add crf decoding and layer norm unit tests +// test kernerls +#define TestKernelVMul TestKernelXYZN +#define TestKernelVAdd TestKernelXYZN +#define TestKernelVAddRelu TestKernelXYZN +#define TestKernelVSub TestKernelXYZN + +#define TestKernelVScal TestKernelAXYN +#define TestKernelVAddBias TestKernelAXYN + +#define TestKernelVRelu TestKernelXYN +#define TestKernelVIdentity TestKernelXYN +#define TestKernelVSquare TestKernelXYN +#define TestKernelVExp TestKernelXYN +#define TestKernelVSigmoid TestKernelXYN +#define TestKernelVTanh TestKernelXYN +#define TestKernelVCopy TestKernelXYN + +#define TestKernelHMax TestKernelXRN +#define TestKernelHSum TestKernelXRN + +#define TestKernelLSTMCtHt TestKernelLSTM +#define TestKernelLSTMC1H1 TestKernelLSTM + +#define TestKernelGRUH1 TestKernelGRU +#define TestKernelGRUHtPart1 TestKernelGRU +#define TestKernelGRUHtPart2 TestKernelGRU + +#define TEST_CPU_KERNEL(kernel_type) \ + TEST(JITKernel, kernel_type) { \ + TestKernel##kernel_type, CPUPlace>(); \ + TestKernel##kernel_type, CPUPlace>(); \ + } -TEST(JITKernel, pool) { - // TODO(TJ): add some test -} +TEST_CPU_KERNEL(VMul); +TEST_CPU_KERNEL(VAdd); +TEST_CPU_KERNEL(VAddRelu); +TEST_CPU_KERNEL(VSub); + +TEST_CPU_KERNEL(VScal); +TEST_CPU_KERNEL(VAddBias); + +TEST_CPU_KERNEL(VRelu); +TEST_CPU_KERNEL(VIdentity); +TEST_CPU_KERNEL(VSquare); +TEST_CPU_KERNEL(VExp); +TEST_CPU_KERNEL(VSigmoid); +TEST_CPU_KERNEL(VTanh); +TEST_CPU_KERNEL(VCopy); + +TEST_CPU_KERNEL(HMax); +TEST_CPU_KERNEL(HSum); + +TEST_CPU_KERNEL(LSTMCtHt); +TEST_CPU_KERNEL(LSTMC1H1); + +TEST_CPU_KERNEL(GRUH1); +TEST_CPU_KERNEL(GRUHtPart1); +TEST_CPU_KERNEL(GRUHtPart2); + +TEST_CPU_KERNEL(NCHW16CMulNC); +TEST_CPU_KERNEL(LayerNorm); +TEST_CPU_KERNEL(CRFDecoding); + +TEST_CPU_KERNEL(SeqPool); +TEST_CPU_KERNEL(EmbSeqPool); +TEST_CPU_KERNEL(MatMul); +TEST_CPU_KERNEL(Softmax); +TEST_CPU_KERNEL(Sgd); +TEST_CPU_KERNEL(VBroadcast); diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc index f83fe355b85566d229a2673d8f27cfb5ca4831d5..b9db6daf0825b573bfc7f684266212f998c91627 100644 --- a/paddle/fluid/operators/layer_norm_op.cc +++ b/paddle/fluid/operators/layer_norm_op.cc @@ -44,11 +44,11 @@ class LayerNormOp : public framework::OperatorWithKernel { int left = static_cast(matrix_dim[0]); int right = static_cast(matrix_dim[1]); if (ctx->HasInput("Scale")) { - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1); PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], right); } if (ctx->HasInput("Bias")) { - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1); PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], right); } diff --git a/paddle/fluid/operators/layer_norm_op.h b/paddle/fluid/operators/layer_norm_op.h index f564a103963bd93732165596712230b0f37f7f26..8627c83b43cc0ff0f56417c0f7f67effa494cd37 100644 --- a/paddle/fluid/operators/layer_norm_op.h +++ b/paddle/fluid/operators/layer_norm_op.h @@ -230,8 +230,8 @@ class LayerNormKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(bias->numel(), right); auto ker = - jit::Get, platform::CPUPlace>( - right); + jit::KernelFuncs, platform::CPUPlace>::Cache() + .At(right); ker(x.data(), out.data(), mean->data(), var->data(), scale->data(), bias->data(), static_cast(left), static_cast(epsilon), right); diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc index 1da14631e35608d479e1b861228d52d6d57def79..e17b6cb59898524d793f3cc78a09232f5b664617 100644 --- a/paddle/fluid/operators/linear_chain_crf_op.cc +++ b/paddle/fluid/operators/linear_chain_crf_op.cc @@ -144,12 +144,12 @@ class LinearChainCRFOp : public framework::OperatorWithKernel { "Output(LogLikelihood) should be not null."); auto emission_dims = ctx->GetInputDim("Emission"); - PADDLE_ENFORCE_EQ(emission_dims.size(), 2UL, + PADDLE_ENFORCE_EQ(emission_dims.size(), 2, "The Input(Emission) should be a 2-D tensor."); PADDLE_ENFORCE(emission_dims[0], "An empty mini-batch is not allowed."); auto transition_dims = ctx->GetInputDim("Transition"); - PADDLE_ENFORCE_EQ(transition_dims.size(), 2UL, + PADDLE_ENFORCE_EQ(transition_dims.size(), 2, "The Input(Transition) should be a 2-D tensor."); PADDLE_ENFORCE_EQ( transition_dims[0] - 2, transition_dims[1], @@ -202,13 +202,13 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel { "Input(LogLikelihood@GRAD) shoudl be not null."); auto emission_exps_dims = ctx->GetInputDim("EmissionExps"); - PADDLE_ENFORCE_EQ(emission_exps_dims.size(), 2UL, + PADDLE_ENFORCE_EQ(emission_exps_dims.size(), 2, "The Input(EmissionExps) should be a 2-D tensor."); PADDLE_ENFORCE(emission_exps_dims[0], "An empty mini-batch is not allowed."); auto transition_exps_dims = ctx->GetInputDim("TransitionExps"); - PADDLE_ENFORCE_EQ(transition_exps_dims.size(), 2UL, + PADDLE_ENFORCE_EQ(transition_exps_dims.size(), 2, "The Input(TransitionExps) should be a 2-D tensor."); PADDLE_ENFORCE_EQ( transition_exps_dims[0] - 2, transition_exps_dims[1], diff --git a/paddle/fluid/operators/lstm_op.h b/paddle/fluid/operators/lstm_op.h index 7d62d2d020ec2e3a29ad8720a8f04fead3a90a63..ca998826dd0118ab4b1ecc23bed8ef882f1bcc92 100644 --- a/paddle/fluid/operators/lstm_op.h +++ b/paddle/fluid/operators/lstm_op.h @@ -151,9 +151,10 @@ class LSTMKernel : public framework::OpKernel { lstm_value.output_value = out_t.data(); lstm_value.state_value = cell_t.data(); lstm_value.state_active_value = cell_pre_act_t.data(); + T cell_clip = 0.0; math::LstmUnitFunctor::compute( - device_ctx, lstm_value, frame_size, cur_batch_size, gate_act, - cell_act, cand_act); + device_ctx, lstm_value, frame_size, cur_batch_size, cell_clip, + gate_act, cell_act, cand_act); lstm_value.prev_state_value = lstm_value.state_value; } @@ -311,10 +312,15 @@ class LSTMGradKernel : public framework::OpKernel { lstm_grad.prev_state_grad = c0_g ? ordered_c0_g.data() : nullptr; } + // lstm_value.output_value not used in bp, set to nullptr + // lstm_grad.state_active_grad not used in bp, set to nullptr + lstm_value.output_value = nullptr; + lstm_grad.state_active_grad = nullptr; int cur_batch_size = bend - bstart; + T cell_clip = 0.0; math::LstmUnitGradFunctor::compute( device_ctx, lstm_value, lstm_grad, frame_size, cur_batch_size, - gate_act, cell_act, cand_act); + cell_clip, gate_act, cell_act, cand_act); if (n > 0) { int pre_h_start = static_cast(batch_starts[n - 1]); diff --git a/paddle/fluid/operators/lstmp_op.cc b/paddle/fluid/operators/lstmp_op.cc index 7a62bc9f828e4d3485628747cdf52c60c5354144..2728aa8a4ee21a9e1fe3deddcdba4c35a6aba7bc 100644 --- a/paddle/fluid/operators/lstmp_op.cc +++ b/paddle/fluid/operators/lstmp_op.cc @@ -73,12 +73,6 @@ class LSTMPOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasInput("C0"), "Input(C0) of LSTMP operator should not be null after " "Input(H0) provided."); - auto h_dims = ctx->GetInputDim("H0"); - auto c_dims = ctx->GetInputDim("C0"); - PADDLE_ENFORCE(h_dims == c_dims, - "The dimension of Input(H0) and Input(C0) " - "should be the same."); - ctx->SetOutputDim("OrderedP0", {h_dims[0], proj_dims[1]}); } auto b_dims = ctx->GetInputDim("Bias"); @@ -180,11 +174,6 @@ class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker { "This LoDTensor is obtained in the forward and used in the " "backward.") .AsIntermediate(); - AddOutput("OrderedP0", - "(Tensor) the projection of the initial hidden state " - "H0. This is a tensor with shape (N x P), where N is the " - "batch size and P is the hidden size.") - .AsIntermediate(); AddAttr("use_peepholes", "(bool, defalut: True) " "whether to enable diagonal/peephole connections.") @@ -193,6 +182,16 @@ class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker { "(bool, defalut: False) " "whether to compute reversed LSTMP.") .SetDefault(false); + AddAttr("cell_clip", + "(float, defalut: 0.0) " + "Clip for Tensor for cell state tensor when clip value is " + "greater than 0.0") + .SetDefault(0.0); + AddAttr("proj_clip", + "(float, defalut: 0.0) " + "Clip for Tensor for projection tensor when clip value is " + "greater than 0.0") + .SetDefault(0.0); AddAttr( "gate_activation", "(string, default: sigmoid)" diff --git a/paddle/fluid/operators/lstmp_op.h b/paddle/fluid/operators/lstmp_op.h index 370dd04d1449a8e211febf9a4f9e90e6f5008e20..c7d6e4205f8862526904e4fa767a2f4c4a2d8481 100644 --- a/paddle/fluid/operators/lstmp_op.h +++ b/paddle/fluid/operators/lstmp_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/activation_op.h" @@ -21,17 +22,50 @@ limitations under the License. */ #include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/lstm_compute.h" #include "paddle/fluid/operators/math/sequence2batch.h" +#include "paddle/fluid/platform/transform.h" namespace paddle { namespace operators { using LoDTensor = framework::LoDTensor; using Tensor = framework::Tensor; +using platform::Transform; template using EigenMatrix = framework::EigenMatrix; +template +class _ClipFunctor { + public: + explicit _ClipFunctor(const T min, const T max) : min_(min), max_(max) {} + HOSTDEVICE T operator()(const T& x) const { + if (x < min_) + return min_; + else if (x > max_) + return max_; + else + return x; + } + + private: + T min_; + T max_; +}; + +template +class _ClipGradFunctor { + public: + explicit _ClipGradFunctor(const T min, const T max) : min_(min), max_(max) {} + HOSTDEVICE T operator()(const T& x, const T& y) const { + return (y > min_ && y < max_) ? x : 0; + } + + private: + T min_; + T max_; +}; + template inline void ReorderInitState(const DeviceContext& ctx, const framework::Tensor& src, @@ -67,9 +101,11 @@ class LSTMPKernel : public framework::OpKernel { auto* bias = ctx.Input("Bias"); auto* hidden_t0 = ctx.Input("H0"); - auto* ordered_proj0 = ctx.Output("OrderedP0"); auto* cell_t0 = ctx.Input("C0"); + auto proj_clip = static_cast(ctx.Attr("proj_clip")); + auto cell_clip = static_cast(ctx.Attr("cell_clip")); + auto* batch_gate = ctx.Output("BatchGate"); batch_gate->mutable_data(ctx.GetPlace()); auto* proj_out = ctx.Output("Projection"); @@ -110,6 +146,7 @@ class LSTMPKernel : public framework::OpKernel { } lstmp_value.prev_state_value = nullptr; Tensor ordered_c0; + Tensor ordered_h0; framework::Vector order(batch_gate->lod()[2]); @@ -169,18 +206,9 @@ class LSTMPKernel : public framework::OpKernel { // Since the batch computing for LSTMP reorders the input sequence // according to their length. The initialized hidden state also needs // to reorder. - - Tensor ordered_h0; - ordered_proj0->mutable_data(ctx.GetPlace()); ReorderInitState(device_ctx, *hidden_t0, order, &ordered_h0, true); - blas.MatMul(ordered_h0, false, *proj_weight, false, static_cast(1.0), - ordered_proj0, static_cast(0.0)); - if (proj_act != math::detail::ActivationType::kIdentity) { - auto proj0_dev = EigenMatrix::From(*ordered_proj0); - ActCompute(cell_act, place, proj0_dev, proj0_dev); - } - blas.MatMul(*ordered_proj0, false, *weight, false, static_cast(1.0), + blas.MatMul(ordered_h0, false, *weight, false, static_cast(1.0), &gate_t, static_cast(1.0)); } @@ -189,8 +217,8 @@ class LSTMPKernel : public framework::OpKernel { lstmp_value.state_value = cell_t.data(); lstmp_value.state_active_value = cell_pre_act_t.data(); math::LstmUnitFunctor::compute( - device_ctx, lstmp_value, frame_size, cur_batch_size, gate_act, - cell_act, cand_act); + device_ctx, lstmp_value, frame_size, cur_batch_size, cell_clip, + gate_act, cell_act, cand_act); lstmp_value.prev_state_value = lstmp_value.state_value; blas.MatMul(hidden_t, false, *proj_weight, false, static_cast(1.0), &proj_t, static_cast(0.0)); @@ -198,6 +226,14 @@ class LSTMPKernel : public framework::OpKernel { auto proj_t_dev = EigenMatrix::From(proj_t); ActCompute(cell_act, place, proj_t_dev, proj_t_dev); } + if (proj_clip && proj_clip > 0.0) { + T* x_data = proj_t.data(); + int64_t numel = proj_t.numel(); + Transform trans; + trans(ctx.template device_context(), x_data, + x_data + numel, x_data, + _ClipFunctor(-1.0 * proj_clip, proj_clip)); + } } math::Batch2LoDTensorFunctor to_seq; @@ -239,6 +275,9 @@ class LSTMPGradKernel : public framework::OpKernel { auto* proj_out = ctx.Input("Projection"); auto* cell_out = ctx.Input("Cell"); + auto proj_clip = static_cast(ctx.Attr("proj_clip")); + auto cell_clip = static_cast(ctx.Attr("cell_clip")); + auto* batch_gate = ctx.Input("BatchGate"); auto* batch_cell_pre_act = ctx.Input("BatchCellPreAct"); auto* batch_hidden = ctx.Input("BatchHidden"); @@ -253,7 +292,6 @@ class LSTMPGradKernel : public framework::OpKernel { auto* bias_g = ctx.Output(framework::GradVarName("Bias")); auto* h0 = ctx.Input("H0"); - auto* ordered_proj0 = ctx.Input("OrderedP0"); auto* c0 = ctx.Input("C0"); auto* h0_g = ctx.Output(framework::GradVarName("H0")); @@ -363,6 +401,17 @@ class LSTMPGradKernel : public framework::OpKernel { Tensor cur_proj = batch_proj.Slice(bstart, bend); Tensor proj_g = batch_proj_g.Slice(bstart, bend); + + if (proj_clip && proj_clip > 0.0) { + T* dx_data = proj_g.data(); + T* x_data = cur_proj.data(); + int64_t numel = proj_g.numel(); + Transform trans; + trans(ctx.template device_context(), dx_data, + dx_data + numel, x_data, dx_data, + _ClipGradFunctor(-1.0 * proj_clip, proj_clip)); + } + if (proj_act != math::detail::ActivationType::kIdentity) { auto cur_proj_dev = EigenMatrix::From(cur_proj); auto proj_g_dev = EigenMatrix::From(proj_g); @@ -405,9 +454,14 @@ class LSTMPGradKernel : public framework::OpKernel { } int cur_batch_size = bend - bstart; + // lstmp_value.output_value not used in bp, set to null + // lstmp_grad.state_active_grad not used in bp, set to null + lstmp_value.output_value = nullptr; + lstmp_grad.state_active_grad = nullptr; + math::LstmUnitGradFunctor::compute( device_ctx, lstmp_value, lstmp_grad, frame_size, cur_batch_size, - gate_act, cell_act, cand_act); + cell_clip, gate_act, cell_act, cand_act); if (n > 0) { int pre_h_start = static_cast(batch_starts[n - 1]); @@ -426,31 +480,14 @@ class LSTMPGradKernel : public framework::OpKernel { ReorderInitState(device_ctx, *h0, order, &ordered_h0, true); if (weight_g) { - blas.MatMul(*ordered_proj0, true, gate_g, false, - static_cast(1.0), weight_g, static_cast(1.0)); + blas.MatMul(ordered_h0, true, gate_g, false, static_cast(1.0), + weight_g, static_cast(1.0)); } } if (h0 && (h0_g || proj_weight_g)) { ordered_h0_g.mutable_data(h0_g->dims(), ctx.GetPlace()); - Tensor proj0_g; - proj0_g.Resize({in_dims[0], proj_weight->dims()[1]}); - proj0_g.mutable_data(ctx.GetPlace()); blas.MatMul(gate_g, false, *weight, true, static_cast(1.0), - &proj0_g, static_cast(0.0)); - if (proj_act != math::detail::ActivationType::kIdentity) { - auto proj0_dev = EigenMatrix::From(*ordered_proj0); - auto proj0_g_dev = EigenMatrix::From(proj0_g); - ActGradCompute(cell_act, place, proj0_dev, proj0_dev, proj0_g_dev, - proj0_g_dev); - } - if (h0_g) { - blas.MatMul(proj0_g, false, *proj_weight, true, static_cast(1.0), - &ordered_h0_g, static_cast(0.0)); - } - if (proj_weight_g) { - blas.MatMul(ordered_h0, true, proj0_g, false, static_cast(1.0), - proj_weight_g, static_cast(1.0)); - } + &ordered_h0_g, static_cast(0.0)); } } } diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 4b6eef18d8b967af5f3a5df0dee750620e7e412a..d4837696241b8c4e3cca4f2afe872c6be559853c 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -39,6 +39,7 @@ math_library(cross_entropy) math_library(cos_sim_functor) math_library(depthwise_conv DEPS cub) math_library(im2col) +math_library(sample_prob) math_library(sampler) math_library(gru_compute DEPS activation_functions math_function) diff --git a/paddle/fluid/operators/math/beam_search.cc b/paddle/fluid/operators/math/beam_search.cc index 69971ef7423eff6bc3f8543a491edb6b0bbd00ca..0155ef188ef967fbf67505d28beeeaf956bb3a70 100644 --- a/paddle/fluid/operators/math/beam_search.cc +++ b/paddle/fluid/operators/math/beam_search.cc @@ -56,15 +56,15 @@ class BeamSearchFunctor { // the output tensor shape should be [num_instances, 1] auto dims = framework::make_ddim( std::vector({static_cast(num_instances), 1})); - selected_ids->Resize(dims); - selected_scores->Resize(dims); - parent_idx->Resize({static_cast(num_instances)}); - auto *selected_ids_data = - selected_ids->mutable_data(platform::CPUPlace()); + selected_ids->mutable_data(dims, platform::CPUPlace()); auto *selected_scores_data = - selected_scores->mutable_data(platform::CPUPlace()); - auto *parent_idx_data = parent_idx->mutable_data(platform::CPUPlace()); + selected_scores->mutable_data(dims, platform::CPUPlace()); + auto *parent_idx_data = + parent_idx + ? parent_idx->mutable_data( + {static_cast(num_instances)}, platform::CPUPlace()) + : nullptr; // fill in data std::vector low_level; @@ -72,7 +72,9 @@ class BeamSearchFunctor { for (auto &items : selected_items) { low_level.push_back(low_offset); for (auto &item : items) { - parent_idx_data[low_offset] = static_cast(low_level.size() - 1); + if (parent_idx) { + parent_idx_data[low_offset] = static_cast(low_level.size() - 1); + } selected_ids_data[low_offset] = item.id; selected_scores_data[low_offset] = item.score; low_offset++; diff --git a/paddle/fluid/operators/math/beam_search.cu b/paddle/fluid/operators/math/beam_search.cu index 61d021ef627f1ccd90b992c2078a7f3ca879422d..ecfeba338482a99735488fec08be8c3adcf4d0f4 100644 --- a/paddle/fluid/operators/math/beam_search.cu +++ b/paddle/fluid/operators/math/beam_search.cu @@ -119,6 +119,18 @@ __device__ __forceinline__ int SelectTopBeam( __syncthreads(); } + if ((num_used_threads & 0x1) != 0) { + // If num_used_threads is a odd number, merge local top_beam of thread 0 + // and num_used_threads - 1 + if (tid_of_seq == 0) { + int index_in_sh = (num_used_threads - 1 + tid) * beam_size; + for (int i = 0; i < beam_size; i++) { + Insert(top_beam_local, top_beam[index_in_sh], beam_size); + index_in_sh++; + } + } + } + num_used_threads = num_used_threads >> 1; if (tid_of_seq < num_used_threads) { int index_in_sh = (num_used_threads + tid) * beam_size; @@ -156,6 +168,7 @@ __device__ __forceinline__ bool PruneEndBeams(Triple* top_beam_local, return finish_flag; } +template __device__ __forceinline__ void WriteBack( int64_t* selected_ids, float* selected_scores, int* parent_idx, size_t* selected_offsets, Triple* top_beam_local, @@ -171,7 +184,9 @@ __device__ __forceinline__ void WriteBack( selected_ids[global_index] = static_cast(top_beam_local[local_index].id); selected_scores[global_index] = top_beam_local[local_index].score; - parent_idx[global_index] = static_cast(global_offset); + if (ReturnParentIdx) { + parent_idx[global_index] = static_cast(global_offset); + } global_index++; } } @@ -229,9 +244,15 @@ __device__ void BeamSearchDetails( selected_offsets[0] = 0; } - WriteBack(selected_ids, selected_scores, parent_idx, selected_offsets, - top_beam_local, seq_offset_start, seq_offset_end, - selected_seq_start, selected_seq_length); + if (parent_idx) { + WriteBack(selected_ids, selected_scores, parent_idx, + selected_offsets, top_beam_local, seq_offset_start, + seq_offset_end, selected_seq_start, selected_seq_length); + } else { + WriteBack(selected_ids, selected_scores, parent_idx, + selected_offsets, top_beam_local, seq_offset_start, + seq_offset_end, selected_seq_start, selected_seq_length); + } } } @@ -325,8 +346,12 @@ class BeamSearchFunctor { selected_ids->mutable_data(selected_dims, context.GetPlace()); float* selected_scores_data = selected_scores->mutable_data(selected_dims, context.GetPlace()); - int* parent_idx_data = parent_idx->mutable_data( - {static_cast(num_seqs * beam_size)}, context.GetPlace()); + int* parent_idx_data = + parent_idx + ? parent_idx->mutable_data( + {static_cast(num_seqs * beam_size)}, + context.GetPlace()) + : nullptr; framework::LoD selected_lod(2); selected_lod[0].assign(abs_lod[level].begin(), abs_lod[level].end()); @@ -384,7 +409,9 @@ class BeamSearchFunctor { {static_cast(selected_lod[1].back()), 1}); selected_ids->Resize(final_selected_dims); selected_scores->Resize(final_selected_dims); - parent_idx->Resize({static_cast(selected_lod[1].back())}); + if (parent_idx) { + parent_idx->Resize({static_cast(selected_lod[1].back())}); + } } } }; diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index f67f57827bc03e134bf87edd5bf033adb5098916..ce8109f64d62b0d412419107881952f1b4ffc75e 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -184,6 +184,9 @@ class Blas { template void VINV(int n, const T* a, T* y) const; + template + void VMERF(int n, const T* a, T* y, int64_t mode) const; + private: const DeviceContext& context_; }; @@ -290,6 +293,11 @@ class BlasT : private Blas { Base()->template VINV(args...); } + template + void VMERF(ARGS... args) const { + Base()->template VMERF(args...); + } + private: const Blas* Base() const { return static_cast*>(this); diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index 972366bc093f4b7f0a090cf31213f75ccd89fd82..ba995dabecbfab8c4952bb7efeaa381f8078821a 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -123,6 +123,11 @@ struct CBlas { static void VINV(ARGS... args) { platform::dynload::vsInv(args...); } + + template + static void VMERF(ARGS... args) { + platform::dynload::vmsErf(args...); + } }; template <> @@ -223,6 +228,11 @@ struct CBlas { static void VINV(ARGS... args) { platform::dynload::vdInv(args...); } + + template + static void VMERF(ARGS... args) { + platform::dynload::vmdErf(args...); + } }; #else @@ -625,6 +635,19 @@ void Blas::VINV(int n, const T *a, T *y) const { #endif } +template <> +template +void Blas::VMERF(int n, const T *a, T *y, + int64_t mode) const { +#ifdef PADDLE_WITH_MKLML + CBlas::VMERF(n, a, y, mode); +#else + for (int i = 0; i < n; ++i) { + y[i] = std::erf(a[i]); + } +#endif +} + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h index 2e3779ff0845294e71f27801049c010e0a585e6b..ad79c58063a8a12c703979fe32a8e671a5ade857 100644 --- a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h +++ b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h @@ -32,7 +32,8 @@ namespace detail { template void naive_lstm_forward_one_sequence(Op op, LstmMetaValue value, - int frame_size, ActivationType active_node, + int frame_size, T cell_clip, + ActivationType active_node, ActivationType active_gate, ActivationType active_state) { T r_value_in; @@ -67,7 +68,7 @@ void naive_lstm_forward_one_sequence(Op op, LstmMetaValue value, op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state, &r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO, - active_node, active_gate, active_state); + &cell_clip, active_node, active_gate, active_state); value_in[i] = r_value_in; value_ig[i] = r_value_ig; @@ -82,7 +83,7 @@ void naive_lstm_forward_one_sequence(Op op, LstmMetaValue value, template void naive_lstm_backward_one_sequence(Op op, LstmMetaValue value, LstmMetaGrad grad, int frame_size, - ActivationType active_node, + T cell_clip, ActivationType active_node, ActivationType active_gate, ActivationType active_state) { T r_value_in; @@ -135,7 +136,7 @@ void naive_lstm_backward_one_sequence(Op op, LstmMetaValue value, &r_grad_ig, &r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad, &r_state, &r_state_grad, &r_state_atv, &r_output_grad, &r_checkI, &r_checkF, &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad, - active_node, active_gate, active_state); + &cell_clip, active_node, active_gate, active_state); grad_in[i] = r_grad_in; grad_ig[i] = r_grad_ig; @@ -154,7 +155,8 @@ void naive_lstm_backward_one_sequence(Op op, LstmMetaValue value, template void avx_lstm_forward_one_sequence(Op op, LstmMetaValue value, - int frame_size, ActivationType active_node, + int frame_size, T cell_clip, + ActivationType active_node, ActivationType active_gate, ActivationType active_state) { #ifdef __AVX__ @@ -194,7 +196,7 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue value, op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state, &r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO, - active_node, active_gate, active_state); + &cell_clip, active_node, active_gate, active_state); value_in[i] = r_value_in; value_ig[i] = r_value_ig; @@ -210,7 +212,7 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue value, template void avx_lstm_backward_one_sequence(Op op, LstmMetaValue value, LstmMetaGrad grad, int frame_size, - ActivationType active_node, + T cell_clip, ActivationType active_node, ActivationType active_gate, ActivationType active_state) { #ifdef __AVX__ @@ -268,7 +270,7 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue value, &r_grad_ig, &r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad, &r_state, &r_state_grad, &r_state_atv, &r_output_grad, &r_checkI, &r_checkF, &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad, - active_node, active_gate, active_state); + &cell_clip, active_node, active_gate, active_state); grad_in[i] = r_grad_in; grad_ig[i] = r_grad_ig; @@ -292,27 +294,27 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue value, template void cpu_lstm_forward(Op op, LstmMetaValue value, int frame_size, - ActivationType active_node, ActivationType active_gate, - ActivationType active_state) { + T cell_clip, ActivationType active_node, + ActivationType active_gate, ActivationType active_state) { if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same::value)) { - avx_lstm_forward_one_sequence(op, value, frame_size, active_node, - active_gate, active_state); + avx_lstm_forward_one_sequence(op, value, frame_size, cell_clip, + active_node, active_gate, active_state); } else { - naive_lstm_forward_one_sequence(op, value, frame_size, active_node, - active_gate, active_state); + naive_lstm_forward_one_sequence(op, value, frame_size, cell_clip, + active_node, active_gate, active_state); } } template void cpu_lstm_backward(Op op, LstmMetaValue value, LstmMetaGrad grad, - int frame_size, ActivationType active_node, + int frame_size, T cell_clip, ActivationType active_node, ActivationType active_gate, ActivationType active_state) { if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same::value)) { - avx_lstm_backward_one_sequence(op, value, grad, frame_size, active_node, - active_gate, active_state); + avx_lstm_backward_one_sequence(op, value, grad, frame_size, cell_clip, + active_node, active_gate, active_state); } else { - naive_lstm_backward_one_sequence(op, value, grad, frame_size, + naive_lstm_backward_one_sequence(op, value, grad, frame_size, cell_clip, active_node, active_gate, active_state); } } diff --git a/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h index 2aecb69237fdf344ebc0bfe72d9c7c147f06358d..e0ca9e7f5b2f4a8bb837768d645b5103aa3e6760 100644 --- a/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h +++ b/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h @@ -31,7 +31,8 @@ namespace detail { */ template __global__ void KeLstmForward(Op op, LstmMetaValue value, int frame_size, - int batch_size, ActivationType active_node, + int batch_size, T cell_clip, + ActivationType active_node, ActivationType active_gate, ActivationType active_state) { const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -72,7 +73,7 @@ __global__ void KeLstmForward(Op op, LstmMetaValue value, int frame_size, op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state, &r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO, - active_node, active_gate, active_state); + &cell_clip, active_node, active_gate, active_state); value.gate_value[frame_idx] = r_value_in; value.gate_value[frame_idx + frame_size] = r_value_ig; @@ -91,7 +92,8 @@ __global__ void KeLstmForward(Op op, LstmMetaValue value, int frame_size, template __global__ void KeLstmBackward(Op op, LstmMetaValue value, LstmMetaGrad grad, int frame_size, - int batch_size, ActivationType active_node, + int batch_size, T cell_clip, + ActivationType active_node, ActivationType active_gate, ActivationType active_state) { const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -148,8 +150,8 @@ __global__ void KeLstmBackward(Op op, LstmMetaValue value, op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_grad_in, &r_grad_ig, &r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad, &r_state, &r_state_grad, &r_state_atv, &r_output_grad, &r_checkI, &r_checkF, - &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad, active_node, - active_gate, active_state); + &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad, &cell_clip, + active_node, active_gate, active_state); grad.gate_grad[frame_idx] = r_grad_in; grad.gate_grad[frame_idx + frame_size] = r_grad_ig; @@ -185,8 +187,8 @@ __global__ void KeLstmBackward(Op op, LstmMetaValue value, template void gpu_lstm_forward(const platform::DeviceContext& context, Op op, LstmMetaValue value, int frame_size, int batch_size, - ActivationType active_node, ActivationType active_gate, - ActivationType active_state) { + T cell_clip, ActivationType active_node, + ActivationType active_gate, ActivationType active_state) { dim3 threads; dim3 grid; if (batch_size == 1) { @@ -205,12 +207,12 @@ void gpu_lstm_forward(const platform::DeviceContext& context, Op op, if (batch_size == 1) { KeLstmForward<<>>( - op, value, frame_size, batch_size, active_node, active_gate, + op, value, frame_size, batch_size, cell_clip, active_node, active_gate, active_state); } else { KeLstmForward<<>>( - op, value, frame_size, batch_size, active_node, active_gate, + op, value, frame_size, batch_size, cell_clip, active_node, active_gate, active_state); } } @@ -218,7 +220,7 @@ void gpu_lstm_forward(const platform::DeviceContext& context, Op op, template void gpu_lstm_backward(const platform::DeviceContext& context, Op op, LstmMetaValue value, LstmMetaGrad grad, - int frame_size, int batch_size, + int frame_size, int batch_size, T cell_clip, ActivationType active_node, ActivationType active_gate, ActivationType active_state) { dim3 threads; @@ -239,13 +241,13 @@ void gpu_lstm_backward(const platform::DeviceContext& context, Op op, if (batch_size == 1) { KeLstmBackward<<>>( - op, value, grad, frame_size, batch_size, active_node, active_gate, - active_state); + op, value, grad, frame_size, batch_size, cell_clip, active_node, + active_gate, active_state); } else { KeLstmBackward<<>>( - op, value, grad, frame_size, batch_size, active_node, active_gate, - active_state); + op, value, grad, frame_size, batch_size, cell_clip, active_node, + active_gate, active_state); } } diff --git a/paddle/fluid/operators/math/detail/lstm_kernel.h b/paddle/fluid/operators/math/detail/lstm_kernel.h index cbe73d62938d7c4c03a2c8731665260624417fd7..8149686c97a030b91e0c4de708b9abf07f83203d 100644 --- a/paddle/fluid/operators/math/detail/lstm_kernel.h +++ b/paddle/fluid/operators/math/detail/lstm_kernel.h @@ -29,7 +29,7 @@ class lstm { public: HOSTDEVICE void operator()(T *value_in, T *value_ig, T *value_fg, T *value_og, T *prev_state, T *state, T *state_atv, T *output, - T *checkI, T *checkF, T *checkO, + T *checkI, T *checkF, T *checkO, T *cell_clip, ActivationType active_node, ActivationType active_gate, ActivationType active_state) { @@ -37,6 +37,15 @@ class lstm { *value_ig = activation(*value_ig + (*prev_state) * (*checkI), active_gate); *value_fg = activation(*value_fg + (*prev_state) * (*checkF), active_gate); *state = (*value_in) * (*value_ig) + (*prev_state) * (*value_fg); + + if (*cell_clip > 0.0) { + if (*state < -1.0 * (*cell_clip)) { + *state = -1.0 * (*cell_clip); + } + if (*state > *cell_clip) { + *state = *cell_clip; + } + } *value_og = activation(*value_og + (*state) * (*checkO), active_gate); *state_atv = activation(*state, active_state); *output = (*value_og) * (*state_atv); @@ -52,7 +61,7 @@ class lstm { __m256 *value_fg, __m256 *value_og, __m256 *prev_state, __m256 *state, __m256 *state_atv, __m256 *output, __m256 *checkI, - __m256 *checkF, __m256 *checkO, + __m256 *checkF, __m256 *checkO, T *cell_clip, ActivationType active_node, ActivationType active_gate, ActivationType active_state) { @@ -65,6 +74,13 @@ class lstm { active_gate); *state = _mm256_add_ps(_mm256_mul_ps(*value_in, *value_ig), _mm256_mul_ps(*prev_state, *value_fg)); + + if (*cell_clip > 0.0f) { + __m256 min = _mm256_set1_ps(0.0f - *cell_clip); + __m256 max = _mm256_set1_ps(*cell_clip); + *state = _mm256_min_ps(max, *state); + *state = _mm256_max_ps(min, *state); + } *value_og = activation( _mm256_add_ps(*value_og, _mm256_mul_ps(*state, *checkO)), active_gate); *state_atv = activation(*state, active_state); @@ -86,15 +102,26 @@ class lstm { T *prev_state, T *prev_state_grad, T *state, T *state_grad, T *state_atv, T *output_grad, T *checkI, T *checkF, T *checkO, T *checkIGrad, - T *checkFGrad, T *checkOGrad, + T *checkFGrad, T *checkOGrad, T *cell_clip, ActivationType active_node, ActivationType active_gate, ActivationType active_state) { *grad_og = activation((*output_grad) * (*state_atv), *value_og, active_gate); - *state_grad += - activation((*output_grad) * (*value_og), *state_atv, active_state) + - (*grad_og) * (*checkO); + if (*cell_clip > 0.0f) { + if (*state >= (*cell_clip) || *state <= (0.0f - (*cell_clip))) { + *state_grad = 0.0f; + } else { + *state_grad += + activation((*output_grad) * (*value_og), *state_atv, active_state) + + (*grad_og) * (*checkO); + } + } else { + *state_grad += + activation((*output_grad) * (*value_og), *state_atv, active_state) + + (*grad_og) * (*checkO); + } + *grad_in = activation((*state_grad) * (*value_ig), *value_in, active_node); *grad_ig = activation((*state_grad) * (*value_in), *value_ig, active_gate); *grad_fg = @@ -117,15 +144,24 @@ class lstm { __m256 *prev_state, __m256 *prev_state_grad, __m256 *state, __m256 *state_grad, __m256 *state_atv, __m256 *output_grad, __m256 *checkI, __m256 *checkF, __m256 *checkO, __m256 *checkIGrad, - __m256 *checkFGrad, __m256 *checkOGrad, ActivationType active_node, - ActivationType active_gate, ActivationType active_state) { + __m256 *checkFGrad, __m256 *checkOGrad, T *cell_clip, + ActivationType active_node, ActivationType active_gate, + ActivationType active_state) { *grad_og = activation(_mm256_mul_ps(*output_grad, *state_atv), *value_og, active_gate); - *state_grad = - _mm256_add_ps(activation(_mm256_mul_ps(*output_grad, *value_og), - *state_atv, active_state), - *state_grad); - *state_grad = _mm256_add_ps(_mm256_mul_ps(*grad_og, *checkO), *state_grad); + if (*cell_clip > 0.0f) { + T *state_ = reinterpret_cast(state); + if (*state_ >= (*cell_clip) || *state_ <= (0.0f - (*cell_clip))) { + *state_grad = _mm256_set1_ps(0.0f); + } else { + *state_grad = + _mm256_add_ps(activation(_mm256_mul_ps(*output_grad, *value_og), + *state_atv, active_state), + *state_grad); + *state_grad = + _mm256_add_ps(_mm256_mul_ps(*grad_og, *checkO), *state_grad); + } + } *grad_in = activation(_mm256_mul_ps(*state_grad, *value_ig), *value_in, active_node); *grad_ig = activation(_mm256_mul_ps(*state_grad, *value_in), *value_ig, diff --git a/paddle/fluid/operators/math/fc_compute.h b/paddle/fluid/operators/math/fc_compute.h index 0ad57c51be79cd3577b43c9af777bff710308fac..66ce57594a14d8c94737b5dbe83af413628ef1cf 100644 --- a/paddle/fluid/operators/math/fc_compute.h +++ b/paddle/fluid/operators/math/fc_compute.h @@ -30,17 +30,16 @@ inline void FCCompute(const BlasT& blas, const int M, return; } if (relu) { - auto compute = jit::KernelFuncs, - platform::CPUPlace>::Cache() - .At(N); + auto compute = + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + N); for (int i = 0; i < M; i++) { T* dst = Y + i * N; compute(B, dst, dst, N); } } else { - auto compute = jit::KernelFuncs, - platform::CPUPlace>::Cache() - .At(N); + auto compute = + jit::KernelFuncs, platform::CPUPlace>::Cache().At(N); #ifdef PADDLE_WITH_MKLML #pragma omp parallel for #endif diff --git a/paddle/fluid/operators/math/lstm_compute.cc b/paddle/fluid/operators/math/lstm_compute.cc index b6882b4fd8e6db8592a282410888d5625bae742a..94bbcbb50670d9f0b11b77cf6a54a99c227521bf 100644 --- a/paddle/fluid/operators/math/lstm_compute.cc +++ b/paddle/fluid/operators/math/lstm_compute.cc @@ -24,12 +24,12 @@ template struct LstmUnitFunctor { static void compute(const platform::CPUDeviceContext& context, LstmMetaValue value, int frame_size, int batch_size, - const detail::ActivationType& gate_act, + T cell_clip, const detail::ActivationType& gate_act, const detail::ActivationType& cell_act, const detail::ActivationType& cand_act) { for (int b = 0; b < batch_size; b++) { detail::cpu_lstm_forward(detail::forward::lstm(), value, frame_size, - cand_act, gate_act, cell_act); + cell_clip, cand_act, gate_act, cell_act); value.gate_value += frame_size * 4; value.state_value += frame_size; value.state_active_value += frame_size; @@ -45,13 +45,14 @@ template struct LstmUnitGradFunctor { static void compute(const platform::CPUDeviceContext& context, LstmMetaValue value, LstmMetaGrad grad, - int frame_size, int batch_size, + int frame_size, int batch_size, T cell_clip, const detail::ActivationType& gate_act, const detail::ActivationType& cell_act, const detail::ActivationType& cand_act) { for (int b = 0; b < batch_size; b++) { detail::cpu_lstm_backward(detail::backward::lstm(), value, grad, - frame_size, cand_act, gate_act, cell_act); + frame_size, cell_clip, cand_act, gate_act, + cell_act); value.gate_value += frame_size * 4; value.state_value += frame_size; diff --git a/paddle/fluid/operators/math/lstm_compute.cu b/paddle/fluid/operators/math/lstm_compute.cu index 1233000083d6efc31fcbc527e8e9efb83224b4e3..e7445d3d40ae92ff66e7d33a38bfdebfc8455f0a 100644 --- a/paddle/fluid/operators/math/lstm_compute.cu +++ b/paddle/fluid/operators/math/lstm_compute.cu @@ -24,12 +24,12 @@ template struct LstmUnitFunctor { static void compute(const platform::CUDADeviceContext& context, LstmMetaValue value, int frame_size, int batch_size, - const detail::ActivationType& gate_act, + T cell_clip, const detail::ActivationType& gate_act, const detail::ActivationType& cell_act, const detail::ActivationType& cand_act) { detail::gpu_lstm_forward(context, detail::forward::lstm(), value, - frame_size, batch_size, cand_act, gate_act, - cell_act); + frame_size, batch_size, cell_clip, cand_act, + gate_act, cell_act); } }; @@ -37,13 +37,13 @@ template struct LstmUnitGradFunctor { static void compute(const platform::CUDADeviceContext& context, LstmMetaValue value, LstmMetaGrad grad, - int frame_size, int batch_size, + int frame_size, int batch_size, T cell_clip, const detail::ActivationType& gate_act, const detail::ActivationType& cell_act, const detail::ActivationType& cand_act) { detail::gpu_lstm_backward(context, detail::backward::lstm(), value, grad, - frame_size, batch_size, cand_act, gate_act, - cell_act); + frame_size, batch_size, cell_clip, cand_act, + gate_act, cell_act); } }; diff --git a/paddle/fluid/operators/math/lstm_compute.h b/paddle/fluid/operators/math/lstm_compute.h index ca2f78e6f318ce39bd2272bbce20f6a6f98fe430..80af5639387aaf6a983365e13c3478353c27a617 100644 --- a/paddle/fluid/operators/math/lstm_compute.h +++ b/paddle/fluid/operators/math/lstm_compute.h @@ -50,7 +50,7 @@ template class LstmUnitFunctor { public: static void compute(const DeviceContext &context, LstmMetaValue value, - int frame_size, int batch_size, + int frame_size, int batch_size, T cell_clip, const detail::ActivationType &gate_act, const detail::ActivationType &cell_act, const detail::ActivationType &cand_act); @@ -61,7 +61,7 @@ class LstmUnitGradFunctor { public: static void compute(const DeviceContext &context, LstmMetaValue value, LstmMetaGrad grad, int frame_size, int batch_size, - const detail::ActivationType &gate_act, + T cell_clip, const detail::ActivationType &gate_act, const detail::ActivationType &cell_act, const detail::ActivationType &cand_act); }; diff --git a/paddle/fluid/operators/math/sample_prob.cc b/paddle/fluid/operators/math/sample_prob.cc new file mode 100644 index 0000000000000000000000000000000000000000..99aa318453eae161807353198a78e11085cd6237 --- /dev/null +++ b/paddle/fluid/operators/math/sample_prob.cc @@ -0,0 +1,26 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/sample_prob.h" + +namespace paddle { +namespace operators { +namespace math { + +template class SampleWithProb; +template class SampleWithProb; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sample_prob.cu b/paddle/fluid/operators/math/sample_prob.cu new file mode 100644 index 0000000000000000000000000000000000000000..8f9391591560cc3f76ac67f43121c4b1cff90e12 --- /dev/null +++ b/paddle/fluid/operators/math/sample_prob.cu @@ -0,0 +1,161 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include + +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/sample_prob.h" +#include "paddle/fluid/operators/math/sampler.h" + +namespace paddle { +namespace operators { +namespace math { + +using Tensor = framework::Tensor; + +template +__device__ T gpu_adjust_prob(const T prob, const int num_samples, + const int num_tries) { + if (num_samples == num_tries) { + return prob * num_samples; + } else { + return -expm1(num_tries * log1p(-prob)); + } +} + +class GPULogUniformSampler { + public: + __device__ int64_t Sample(float random, const int range, + const float log_range) const; + __device__ float Probability(int64_t value, const float log_range) const; +}; + +__device__ int64_t GPULogUniformSampler::Sample(float random, const int range, + const float log_range) const { + // Got Log Uniform distribution from uniform distribution by + // inverse_transform_sampling method + const int64_t value = static_cast(exp(random * log_range)) - 1; + // Mathematically, value should be <= range_, but might not be due to some + // floating point roundoff, so we mod by range_. + return value % range; +} + +__device__ float GPULogUniformSampler::Probability( + int64_t value, const float log_range) const { + // Given f(x) = 1/[(x+1) * log_range_] + // The value's probability is integral of f(x) from value to (value + 1) + return (log((value + 2.0) / (value + 1.0))) / log_range; +} + +template +__global__ void SamplingCondidate( + const size_t n, const int num_tries, const int range, const float log_range, + const int num_true, const std::size_t num_samples, + const int64_t* label_data, int64_t* samples_data, T* probabilities_data) { + const int num_sampled_classes = num_true + num_samples; + + int idx = blockDim.x * blockIdx.x + threadIdx.x; + int step_size = 0; + GPULogUniformSampler sampler; + + for (; idx < n; idx += blockDim.x * gridDim.x) { + int col_idx = idx % num_sampled_classes; + int row_idx = idx / num_sampled_classes; + if (col_idx < num_true) { + samples_data[idx] = label_data[row_idx * num_true + col_idx]; + } else { + samples_data[idx] = samples_data[col_idx]; + } + probabilities_data[idx] = sampler.Probability(samples_data[idx], log_range); + probabilities_data[idx] = + gpu_adjust_prob(probabilities_data[idx], num_samples, num_tries); + } +} + +template +int UniqSampler(const Sampler& sampler, const std::size_t num_samples, + int64_t* samples_data) { + // sample num_samles unique samples for an example, note that they are not + // all negative samples + std::unordered_set tmp_samples; + tmp_samples.clear(); + int num_tries = 0; + int j = 0; + while (j < num_samples) { + ++num_tries; + auto v = sampler.Sample(); + auto insert_ok = tmp_samples.insert(v).second; + if (!insert_ok) { + continue; + } + samples_data[j] = v; + ++j; + } + return num_tries; +} + +template +void GPUSampleWithProb::operator()( + const platform::CUDADeviceContext& context, const int seed, + const int dict_size, const bool uniq, const std::size_t num_samples, + const Tensor* L, Tensor* S, Tensor* P) { + // UNDERSTAND: dimension issues + const auto lbl_dim = L->dims(); + const int batch_size = lbl_dim[0]; + const int num_true = lbl_dim[1]; + const int num_sampled_classes = num_true + num_samples; + framework::DDim ret_dim{batch_size, num_sampled_classes}; + + // UNDERSTAND: raw data view + const int64_t* label_data = L->data(); + int64_t* samples_data = S->data(); + T* probabilities_data = P->data(); + + int s_size = num_samples; + framework::DDim s_dim{s_size}; + Tensor s; + int64_t* s_data = s.mutable_data(s_dim, platform::CPUPlace()); + + math::LogUniformSampler sampler(dict_size, seed); + + int range = dict_size; + float log_range = log(range + 1); + + int num_tries = UniqSampler(sampler, num_samples, s_data); + VLOG(1) << "num_tries: " << num_tries; + PADDLE_ENFORCE(cudaMemcpy(samples_data + num_true, s_data, + sizeof(int64_t) * num_samples, + cudaMemcpyHostToDevice)); + + int threads = 512; + const size_t size = batch_size * num_sampled_classes; + int grid = (batch_size * num_sampled_classes + threads - 1) / threads; + SamplingCondidate<<>>( + size, num_tries, range, log_range, num_true, num_samples, label_data, + samples_data, probabilities_data); +} + +template class GPUSampleWithProb; +template class GPUSampleWithProb; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sample_prob.h b/paddle/fluid/operators/math/sample_prob.h new file mode 100644 index 0000000000000000000000000000000000000000..e5a6d84cb2b0527c606e62a19ef02d669945ecb1 --- /dev/null +++ b/paddle/fluid/operators/math/sample_prob.h @@ -0,0 +1,118 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/math/sampler.h" + +namespace paddle { +namespace operators { +namespace math { + +using Tensor = framework::Tensor; + +/* UNDERSTAND: utility function to adjust probability for unique sampling, +return whatever as it is if not using unique samping */ +template +static T adjust_prob(const T prob, const int num_samples, const int num_tries) { + if (num_samples == num_tries) { + return prob * num_samples; + } else { + return -expm1(num_tries * log1p(-prob)); + } +} + +template +class SampleWithProb { + public: + void operator()(const DeviceContext& context, const Sampler& sampler, + const std::size_t num_samples, const Tensor* L, Tensor* S, + Tensor* P) { + // UNDERSTAND: dimension issues + const auto lbl_dim = L->dims(); + const int batch_size = lbl_dim[0]; + const int num_true = lbl_dim[1]; + const int num_sampled_classes = num_true + num_samples; + framework::DDim ret_dim{batch_size, num_sampled_classes}; + + // UNDERSTAND: raw data view + const int64_t* label_data = L->data(); + int64_t* samples_data = + S->mutable_data(ret_dim, context.GetPlace()); + T* probabilities_data = P->mutable_data(ret_dim, context.GetPlace()); + + // temp sets for unique sampling + std::unordered_set tmp_samples; + int j = 0; // column index + // add true labels, not that efficient + while (j < num_true) { + for (int i = 0; i < batch_size; ++i) { + auto samples_index = i * num_sampled_classes + j; + auto v = label_data[i * num_true + j]; + samples_data[samples_index] = v; + probabilities_data[samples_index] = sampler.Probability(v); + } + ++j; + } + + // sample num_samles unique samples for an example, note that they are not + // all negative samples + tmp_samples.clear(); + int num_tries = 0; + while (j < num_sampled_classes) { + ++num_tries; + auto v = sampler.Sample(); + auto insert_ok = tmp_samples.insert(v).second; + if (!insert_ok) { + continue; + } + auto p = sampler.Probability(v); + for (int i = 0; i < batch_size; ++i) { + auto samples_index = i * num_sampled_classes + j; + samples_data[samples_index] = v; + probabilities_data[samples_index] = p; + } + ++j; + } + + // compute Q(y|x), because of unique sampling, probabilities need to be + // adjusted + for (int k = 0; k < num_sampled_classes; ++k) { + for (int i = 0; i < batch_size; ++i) { + auto samples_index = i * num_sampled_classes + k; + probabilities_data[samples_index] = adjust_prob( + probabilities_data[samples_index], num_samples, num_tries); + } + } + } +}; + +#ifdef PADDLE_WITH_CUDA +template +class GPUSampleWithProb { + public: + void operator()(const platform::CUDADeviceContext& context, const int seed, + const int dict_size, const bool uniq, + const std::size_t num_samples, const Tensor* L, Tensor* S, + Tensor* P); +}; +#endif +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc index 2a47502614b9cd3df4583992669ab4bf78228181..7af44f2b2ca56f615ca0c8ad4590958af2abe9eb 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cc +++ b/paddle/fluid/operators/math/sequence_pooling.cc @@ -256,8 +256,8 @@ class SequencePoolFunctor { static_cast(input.numel() / input.dims()[0]), jit::SeqPoolType::kSum); auto seqpool = - jit::Get, platform::CPUPlace>( - attr); + jit::KernelFuncs, platform::CPUPlace>::Cache() + .At(attr); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { attr.h = static_cast(lod[i + 1] - lod[i]); seqpool(src, dst, &attr); diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index a1cb3f972826a67721b00ce6df0ec48cc34d6e03..d77b6712c548370a99e350b73ab86b170c0e17dc 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -82,8 +82,7 @@ class SoftmaxFunctor> { const int kClassDim = 1; // 2D data. Batch x C auto compute_softmax = - jit::KernelFuncs, - platform::CPUPlace>::Cache() + jit::KernelFuncs, platform::CPUPlace>::Cache() .At(in_dims[kClassDim]); compute_softmax(in_data, out_data, in_dims[kClassDim], in_dims[kBatchDim]); } diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index e16b6f78d16ce29cc493c4c795c7fe97a4bf2550..43559940d925e6fff29f0c5c66ec1a3dc717aaf4 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -52,11 +52,6 @@ class MKLDNNActivationKernel "Wrong layout/format set for Input x tensor"); Functor functor; - - auto attrs = functor.GetAttrs(); - for (auto &attr : attrs) { - *attr.second = ctx.Attr(attr.first); - } functor(ctx); } }; @@ -76,11 +71,6 @@ class MKLDNNActivationGradKernel "is_test attribute should be set to False in training phase."); Functor functor; - - auto attrs = functor.GetAttrs(); - for (auto &attr : attrs) { - *attr.second = ctx.Attr(attr.first); - } functor(ctx); } }; @@ -106,8 +96,7 @@ void eltwise_forward(const framework::ExecutionContext &ctx, std::vector src_tz = framework::vectorize2int(x->dims()); - auto src_format = - src_tz.size() == 2 ? mkldnn::memory::format::nc : x->format(); + auto src_format = x->format(); const std::string key = gethash(src_tz, algorithm); const std::string key_src_data = @@ -137,10 +126,8 @@ void eltwise_forward(const framework::ExecutionContext &ctx, if (p_fwd == nullptr) { // create mkldnn memory for input X - auto src_md = platform::MKLDNNMemDesc( - src_tz, platform::MKLDNNGetDataType(), src_format); auto src_memory = std::shared_ptr( - new memory({src_md, mkldnn_engine}, to_void_cast(x_data))); + new memory(x->get_mkldnn_prim_desc(), to_void_cast(x_data))); // save src_memory to be referred in backward path dev_ctx.SetBlob(key_src_mem, src_memory); @@ -187,8 +174,7 @@ void eltwise_forward(const framework::ExecutionContext &ctx, pipeline.push_back(*p_fwd); stream(stream::kind::eager).submit(pipeline).wait(); - y->set_layout(DataLayout::kMKLDNN); - y->set_format(GetMKLDNNFormat(*dst_memory)); + y->set_mkldnn_prim_desc(dst_memory->get_primitive_desc()); } template @@ -206,9 +192,6 @@ void eltwise_grad(const framework::ExecutionContext &ctx, std::vector diff_dst_tz = framework::vectorize2int(diff_y->dims()); - auto diff_y_format = - diff_dst_tz.size() == 2 ? mkldnn::memory::format::nc : diff_y->format(); - const std::string key = gethash(diff_dst_tz, algorithm); const std::string key_src_data = key + ctx.op().Input("Out") + "@eltwise_fwd_src_data"; @@ -220,8 +203,8 @@ void eltwise_grad(const framework::ExecutionContext &ctx, key + std::to_string(*p_src_layout) + "@eltwise_fwd_src_mem"; const std::string key_fwd_pd = key + std::to_string(*p_src_layout) + "@eltwise_fwd_pd"; - const std::string key_with_layouts = - key + std::to_string(*p_src_layout) + "-" + std::to_string(diff_y_format); + const std::string key_with_layouts = key + std::to_string(*p_src_layout) + + "-" + std::to_string(diff_y->format()); const std::string key_diff_src_mem = key_with_layouts + "@eltwise_diff_src_mem"; const std::string key_diff_dst_mem = @@ -235,7 +218,7 @@ void eltwise_grad(const framework::ExecutionContext &ctx, std::static_pointer_cast(dev_ctx.GetBlob(key_src_mem)); PADDLE_ENFORCE(src_memory != nullptr, "Fail to find src_memory in device context"); - src_memory->set_data_handle(*p_src_data.get()); + src_memory->set_data_handle(*p_src_data); std::shared_ptr diff_src_memory; @@ -244,10 +227,8 @@ void eltwise_grad(const framework::ExecutionContext &ctx, if (p_grad == nullptr) { // create mkldnn memory for input diff_y - auto diff_dst_md = platform::MKLDNNMemDesc( - diff_dst_tz, platform::MKLDNNGetDataType(), diff_y_format); auto diff_dst_memory = std::shared_ptr( - new memory({diff_dst_md, mkldnn_engine}, to_void_cast(diff_y_data))); + new memory(diff_y->get_mkldnn_prim_desc(), to_void_cast(diff_y_data))); dev_ctx.SetBlob(key_diff_dst_mem, diff_dst_memory); // retrieve eltwise primitive desc from device context @@ -291,8 +272,7 @@ void eltwise_grad(const framework::ExecutionContext &ctx, pipeline.push_back(*p_grad); stream(stream::kind::eager).submit(pipeline).wait(); - diff_x->set_layout(DataLayout::kMKLDNN); - diff_x->set_format(GetMKLDNNFormat(*diff_src_memory)); + diff_x->set_mkldnn_prim_desc(diff_src_memory->get_primitive_desc()); } template diff --git a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc index bddca232e6c8a2a7fde998877006e37ee6d3d0dc..04e45d4853907bb7d6b5ce362892a2183fd4b60e 100644 --- a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc @@ -206,17 +206,14 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu; // create mkldnn memory from input x tensor - mkldnn::memory::format input_format = - platform::MKLDNNFormatForSize(src_tz.size(), x->format()); // keys for backward pass const std::string key = BatchNormMKLDNNHandler::GetHash( - src_tz, epsilon, flags, global_stats, input_format, + src_tz, epsilon, flags, global_stats, x->format(), ctx.op().Output("SavedMean")); const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd"; - auto user_src_md = platform::MKLDNNMemDesc( - {src_tz}, platform::MKLDNNGetDataType(), input_format); + auto user_src_md = x->get_mkldnn_prim_desc().desc(); // create primitive descriptor for batch norm forward using bn_fwd_types = bn_type_traits; @@ -230,8 +227,8 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { BatchNormMKLDNNHandler handler(batch_norm_fwd_pd, dev_ctx, mkldnn_engine, key); - auto src_memory = - handler.AcquireSrcMemory(user_src_md, to_void_cast(x_data)); + auto src_memory = handler.AcquireSrcMemory(x->get_mkldnn_prim_desc(), + to_void_cast(x_data)); // crate mkldnn memory for weights(scale/shift) auto scaleshift_memory = @@ -265,8 +262,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { variance_memory, false); } - y->set_layout(DataLayout::kMKLDNN); - y->set_format(platform::GetMKLDNNFormat(*dst_memory)); + y->set_mkldnn_prim_desc(dst_memory->get_primitive_desc()); std::vector pipeline; pipeline.push_back(*batch_norm_p); @@ -336,9 +332,6 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { using bn_bwd_types = bn_type_traits; - mkldnn::memory::format dst_format = - platform::MKLDNNFormatForSize(src_tz.size(), diff_y->format()); - mkldnn::memory::format input_format = platform::MKLDNNFormatForSize(src_tz.size(), x->format()); @@ -346,14 +339,14 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { // keys from forward pass const std::string key = BatchNormMKLDNNHandler::GetHash( - src_tz, epsilon, flags, false, input_format, + src_tz, epsilon, flags, false, x->format(), ctx.op().Input("SavedMean")); const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd"; // keys for primitives reuse const std::string key_with_hash = key + BatchNormMKLDNNHandler::GetHash(src_tz, epsilon, flags, false, - input_format); + x->format()); const std::string key_batch_norm_bwd_p = key_with_hash + "@batch_norm_bwd_p"; const std::string key_batch_norm_src_mem_p = @@ -373,9 +366,8 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { primitive reorder_diff_dst; bool is_diff_dst_reordered = false; - auto user_diff_dst_memory = memory( - {{{diff_dst_tz}, memory::data_type::f32, dst_format}, mkldnn_engine}, - to_void_cast(diff_y_data)); + auto user_diff_dst_memory = + memory(diff_y->get_mkldnn_prim_desc(), to_void_cast(diff_y_data)); // MKLDNN requires a single piece of memory for scale and shift/bias data const size_t scaleshift_size = 2 * ic; @@ -459,10 +451,7 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { dev_ctx.SetBlob(key_batch_norm_diff_dst_mem_p, diff_dst_memory); // set layout/format of output tensors - diff_x->set_layout(DataLayout::kMKLDNN); - diff_x->set_format((memory::format)diff_src_memory->get_primitive_desc() - .desc() - .data.format); + diff_x->set_mkldnn_prim_desc(diff_src_memory->get_primitive_desc()); } else { // primitives already exist UpdateMemoryData(dev_ctx, key_batch_norm_src_mem_p, to_void_cast(x_data)); @@ -487,10 +476,7 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { } // set layout/format of output tensors - diff_x->set_layout(DataLayout::kMKLDNN); - diff_x->set_format((memory::format)diff_src_memory->get_primitive_desc() - .desc() - .data.format); + diff_x->set_mkldnn_prim_desc(diff_src_memory->get_primitive_desc()); } // execute optional reorder and batch_norm backward primitive diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc index 7ad674056f0d753d79408a11eff1aca47a84998a..54c6a71111a2cc2f9e5004922ae5d3541a9d0a70 100644 --- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc @@ -47,11 +47,6 @@ static memory::primitive_desc CreateMemPrimDesc(const Tensor& input, return mem_prim_desc; } -static mkldnn::memory::format GetDstMemFormat( - const concat::primitive_desc& concat_pd) { - return (memory::format)concat_pd.dst_primitive_desc().desc().data.format; -} - static platform::CPUPlace GetCpuPlace( const paddle::framework::ExecutionContext& ctx) { auto place = ctx.GetPlace(); @@ -139,8 +134,7 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { auto concat = prim_creator.CreateConcatPrimitive(concat_pd, output, place); stream(stream::kind::eager).submit({concat}).wait(); - output->set_layout(DataLayout::kMKLDNN); - output->set_format(GetDstMemFormat(concat_pd)); + output->set_mkldnn_prim_desc(concat_pd.dst_primitive_desc()); } }; } // namespace operators diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 0ce174654e85175f0b949f860a00afafc548ed3e..14ca3e8073b9512732876e512a30968b15884495 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -96,12 +96,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto* bias = ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; auto* output = ctx.Output("Output"); - PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN && - input->format() != memory::format::format_undef, - "Wrong layout/format set for Input tensor"); - PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN && - filter->format() != memory::format::format_undef, - "Wrong layout/format set for Filter tensor"); + PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN); + PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN); PADDLE_ENFORCE(input->dims().size() == 4 || input->dims().size() == 5, "Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW"); PADDLE_ENFORCE(filter->dims().size() == 4 || filter->dims().size() == 5, @@ -148,14 +144,19 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { std::vector pipeline; - auto src_format = input->format(); - mkldnn::memory::format weights_format = - GetWeightsFormat(filter->format(), g, is_conv3d); - - auto user_src_md = platform::MKLDNNMemDesc( - {src_tz}, platform::MKLDNNGetDataType(), src_format); - auto user_weights_md = platform::MKLDNNMemDesc( - {weights_tz}, platform::MKLDNNGetDataType(), weights_format); + // For convolution with groups we need to recreate primitive descriptor + // as Paddle tensor is not having group dims while mkldnn treats + // group as another dimensions + mkldnn::memory::primitive_desc user_weights_mpd = + filter->get_mkldnn_prim_desc(); + if (g > 1) { + mkldnn::memory::format weights_format = + GetWeightsFormat(filter->format(), g, is_conv3d); + auto user_weights_md = platform::MKLDNNMemDesc( + {weights_tz}, platform::MKLDNNGetDataType(), weights_format); + user_weights_mpd = + mkldnn::memory::primitive_desc(user_weights_md, mkldnn_engine); + } /* create memory descriptor for convolution without specified format * ('any') which lets a primitive (convolution in this case) choose @@ -165,7 +166,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto chosen_memory_format = platform::data_format_to_memory_format(data_format); - weights_format = mkldnn::memory::format::any; + mkldnn::memory::format weights_format = mkldnn::memory::format::any; // Check the format for user's special output if (chosen_memory_format != mkldnn::memory::format::any) { if (is_conv3d) { @@ -205,10 +206,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { platform::ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key); // create mkldnn memory from input tensors (data/weights) - auto user_src_memory_p = - handler.AcquireSrcMemory(user_src_md, to_void_cast(input_data)); + auto user_src_memory_p = handler.AcquireSrcMemory( + input->get_mkldnn_prim_desc(), to_void_cast(input_data)); auto user_weights_memory_p = handler.AcquireWeightsMemory( - user_weights_md, to_void_cast(filter_data)); + user_weights_mpd, to_void_cast(filter_data)); // create reorder primitive if the input format is not the preferred one auto src_memory_p = @@ -281,8 +282,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { pipeline.push_back(*conv_p); stream(stream::kind::eager).submit(pipeline).wait(); - output->set_layout(DataLayout::kMKLDNN); - output->set_format(GetMKLDNNFormat(*dst_memory_p)); + output->set_mkldnn_prim_desc(dst_memory_p->get_primitive_desc()); } void ComputeINT8(const paddle::framework::ExecutionContext& ctx) const { const bool is_test = ctx.Attr("is_test"); @@ -947,8 +947,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { // push primitive to stream and wait until it's executed pipeline.push_back(*conv_bwd_weights_p); - filter_grad->set_layout(DataLayout::kMKLDNN); - filter_grad->set_format(GetMKLDNNFormat(*diff_weights_memory_p)); + auto filter_grad_mpd = diff_weights_memory_p->get_primitive_desc(); + filter_grad->set_mkldnn_prim_desc(filter_grad_mpd); } if (input_grad) { @@ -971,8 +971,7 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { pipeline.push_back(*conv_bwd_data_p); - input_grad->set_layout(DataLayout::kMKLDNN); - input_grad->set_format(GetMKLDNNFormat(*diff_src_memory_p)); + input_grad->set_mkldnn_prim_desc(diff_src_memory_p->get_primitive_desc()); } stream(stream::kind::eager).submit(pipeline).wait(); } @@ -990,12 +989,12 @@ REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, ::paddle::platform::CPUPlace, U8, - ops::kConvMKLDNNFP32, + ops::kConvMKLDNNINT8, ops::ConvMKLDNNOpKernel); REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, ::paddle::platform::CPUPlace, S8, - ops::kConvMKLDNNFP32, + ops::kConvMKLDNNINT8, ops::ConvMKLDNNOpKernel); REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d_grad, MKLDNN, diff --git a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc index 317d4cebe26b81ff03c212e6328233d5152ed1b4..79a0c5c7683d677daeb4feea10deab86407f944c 100644 --- a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc @@ -221,8 +221,7 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel { pipeline.push_back(*conv_p); mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); - output->set_layout(DataLayout::kMKLDNN); - output->set_format(platform::GetMKLDNNFormat(*dst_memory_p)); + output->set_mkldnn_prim_desc(dst_memory_p->get_primitive_desc()); } private: diff --git a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc index 262b7408a7f5f65c4d97120914c16f38ce5fdbe7..accc9a9d71ffccf2812d57a7516eaf7e0f83275c 100644 --- a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/dequantize_op.h" #include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" namespace paddle { namespace operators { @@ -30,6 +31,18 @@ using framework::DataLayout; using mkldnn::stream; using platform::GetMKLDNNFormat; +std::string CreateKey(const paddle::framework::ExecutionContext& ctx, + const mkldnn::memory::data_type& src_dt, + const std::vector& src_tz, const float scale_data) { + std::string key; + key.reserve(platform::MKLDNNHandler::MaxKeyLength); + platform::MKLDNNHandler::AppendKey(&key, std::to_string(src_dt)); + platform::MKLDNNHandler::AppendKeyDims(&key, src_tz); + platform::MKLDNNHandler::AppendKey(&key, std::to_string(scale_data)); + platform::MKLDNNHandler::AppendKey(&key, ctx.op().Output("Output")); + return key; +} + template class DeQuantOpKernel : public framework::OpKernel { public: @@ -51,31 +64,55 @@ class DeQuantOpKernel : public framework::OpKernel { mkldnn::memory::data_type src_dt = paddle::framework::ToMKLDNNDataType(input->type()); mkldnn::memory::format src_fmt = input->format(); + std::string key = CreateKey(ctx, src_dt, src_tz, reorder_scale[0]); + const std::string key_prim = key + "@reorder_p"; + const std::string key_src_mem = key + "@src_mem"; + const std::string key_dst_mem = key + "@dst_mem"; + + std::shared_ptr src_memory; + std::shared_ptr dst_memory; + std::shared_ptr reorder_p; + reorder_p = std::static_pointer_cast(dev_ctx.GetBlob(key_prim)); + + if (reorder_p == nullptr) { + mkldnn::primitive_attr attri; + int mask = 0; + attri.set_output_scales(mask, reorder_scale); + + auto src_md = platform::MKLDNNMemDesc({src_tz}, src_dt, src_fmt); + auto src_pd = mkldnn::memory::primitive_desc(src_md, engine); + src_memory = + std::make_shared(src_pd, to_void_cast(input_data)); + std::shared_ptr src_memory_p = + std::shared_ptr(new primitive::at(*src_memory)); + + auto dst_md = platform::MKLDNNMemDesc({dst_tz}, memory::data_type::f32, + memory::format::nchw); + auto dst_pd = mkldnn::memory::primitive_desc(dst_md, engine); + dst_memory = std::make_shared( + dst_pd, to_void_cast(output_data)); + + auto reorder_pd = std::shared_ptr( + new reorder::primitive_desc(src_pd, dst_pd, attri)); + reorder_p = std::shared_ptr( + new reorder(*reorder_pd, *src_memory_p, *dst_memory)); + dev_ctx.SetBlob(key_prim, reorder_p); + dev_ctx.SetBlob(key_src_mem, src_memory); + dev_ctx.SetBlob(key_dst_mem, dst_memory); + } else { + src_memory = std::static_pointer_cast( + dev_ctx.GetBlob(key_src_mem)); + src_memory->set_data_handle(to_void_cast(input_data)); + + dst_memory = std::static_pointer_cast( + dev_ctx.GetBlob(key_dst_mem)); + dst_memory->set_data_handle(output->mutable_data(ctx.GetPlace())); + } - mkldnn::primitive_attr attri; - int mask = 0; - attri.set_output_scales(mask, reorder_scale); - - auto src_md = platform::MKLDNNMemDesc({src_tz}, src_dt, src_fmt); - auto src_pd = mkldnn::memory::primitive_desc(src_md, engine); - auto src_memory = - std::make_shared(src_pd, to_void_cast(input_data)); - std::shared_ptr src_memory_p = - std::shared_ptr(new primitive::at(*src_memory)); - - auto dst_md = platform::MKLDNNMemDesc({dst_tz}, memory::data_type::f32, - memory::format::nchw); - auto dst_pd = mkldnn::memory::primitive_desc(dst_md, engine); - auto dst_memory = mkldnn::memory(dst_pd, to_void_cast(output_data)); - - auto reorder_pd = std::shared_ptr( - new reorder::primitive_desc(src_pd, dst_pd, attri)); - auto reorder_p = std::shared_ptr( - new reorder(*reorder_pd, *src_memory_p, dst_memory)); pipeline.push_back(*reorder_p); stream(stream::kind::eager).submit(pipeline).wait(); - output->set_format(GetMKLDNNFormat(dst_memory)); + output->set_format(GetMKLDNNFormat(*dst_memory)); } }; diff --git a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc index 76b00b396c1349eff5db1059268e7cf280a8fc64..d01e8dbf4ce0c92bb81fc76df68d5424f9da0717 100644 --- a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc @@ -42,8 +42,12 @@ class GaussianMKLDNNKernel : public paddle::framework::OpKernel { // The format of output is set as the mkldnn's format // TODO(@mozga-intel) The format of matrix sets inside the another layers. - tensor->set_layout(DataLayout::kMKLDNN); - tensor->set_format(mkldnn::memory::format::oihw); + // TODO(jczaja): Remove this hack after checking performance on block layout + + auto tensor_mem_pd = paddle::platform::create_prim_desc_from_dims( + paddle::framework::vectorize2int(tensor->dims()), + mkldnn::memory::format::oihw); + tensor->set_mkldnn_prim_desc(tensor_mem_pd); } }; } // namespace operators diff --git a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc index 097ba01d401dbc7969e30f576cac2567c874ed99..4ff27ab12280b56abdf72056fe69ec713f2f2f46 100644 --- a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc @@ -81,10 +81,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { auto e_mid = framework::EigenTensor::From(*mid); e_mid = e_mid.constant(k); - auto dims = paddle::framework::vectorize2int(x->dims()); - - auto src_md = paddle::platform::MKLDNNMemDesc( - dims, mkldnn::memory::data_type::f32, x->format()); + auto src_md = x->get_mkldnn_prim_desc().desc(); auto forward_desc = mkldnn::lrn_forward::desc{mkldnn::prop_kind::forward, mkldnn::lrn_across_channels, @@ -94,7 +91,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { beta, k}; - auto src_memory_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine}; + auto src_memory_pd = x->get_mkldnn_prim_desc(); if (!is_test) { const std::string key = ctx.op().Output("Out"); @@ -111,16 +108,15 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { src_memory->set_data_handle( static_cast(const_cast(input_data))); - auto dst_memory = mkldnn::memory(forward_pd->dst_primitive_desc(), - static_cast(output_data)); + auto dst_memory_pd = forward_pd->dst_primitive_desc(); + auto dst_memory = + mkldnn::memory(dst_memory_pd, static_cast(output_data)); auto workspace_memory = insert_to_context( key_workspace_memory, dev_ctx, forward_pd->workspace_primitive_desc()); run_primitive(*forward_pd, *src_memory, *workspace_memory, dst_memory); - - out->set_layout(framework::DataLayout::kMKLDNN); - out->set_format(platform::GetMKLDNNFormat(dst_memory)); + out->set_mkldnn_prim_desc(dst_memory_pd); } else { auto forward_pd = mkldnn::lrn_forward::primitive_desc{forward_desc, mkldnn_engine}; @@ -128,13 +124,12 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { src_memory_pd, static_cast(const_cast(input_data))}; auto workspace_memory = mkldnn::memory{forward_pd.workspace_primitive_desc()}; + auto dst_memory_pd = forward_pd.dst_primitive_desc(); auto dst_memory = mkldnn::memory(forward_pd.dst_primitive_desc(), static_cast(output_data)); run_primitive(forward_pd, src_memory, workspace_memory, dst_memory); - - out->set_layout(framework::DataLayout::kMKLDNN); - out->set_format(platform::GetMKLDNNFormat(dst_memory)); + out->set_mkldnn_prim_desc(dst_memory_pd); } } }; diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc index f4bad7b712b2b078ed68f0a3d0e751d9ae2d6191..5d8e81921157cbdf35f7016741ab45c362b7261f 100644 --- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/operators/pool_op.h" #include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" namespace paddle { namespace operators { @@ -29,23 +30,23 @@ using mkldnn::stream; using platform::to_void_cast; // Generate keys for storing/retriving primitives for this operator -// TODO(jczaja): Make hashing function more optimial -static std::string gethash(const memory::dims& input_dims, - const std::string& pooling_type, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const memory::data_type& dt, - const std::string& suffix) { - auto dims2str = [](const memory::dims& operand_dims) { - std::string dstr = ""; - for (size_t i = 0; i < operand_dims.size(); ++i) { - dstr += std::to_string(operand_dims[i]) + "-"; - } - return dstr; - }; - return dims2str(input_dims) + dims2str(ksize) + dims2str(strides) + - dims2str(paddings) + std::to_string(dt) + pooling_type + suffix; +std::string CreateKey(const paddle::framework::ExecutionContext& ctx, + const memory::dims& input_dims, + const std::string& pooling_type, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const memory::data_type& dt, const std::string& suffix) { + std::string key; + key.reserve(platform::MKLDNNHandler::MaxKeyLength); + platform::MKLDNNHandler::AppendKeyDims(&key, input_dims); + platform::MKLDNNHandler::AppendKey(&key, pooling_type); + platform::MKLDNNHandler::AppendKeyVec(&key, ksize); + platform::MKLDNNHandler::AppendKeyVec(&key, strides); + platform::MKLDNNHandler::AppendKeyVec(&key, paddings); + platform::MKLDNNHandler::AppendKey(&key, std::to_string(dt)); + platform::MKLDNNHandler::AppendKey(&key, suffix); + return key; } static inline int ComputeCeiledOutput(int input_size, int kernel_size, @@ -114,8 +115,8 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { mkldnn::memory::data_type dt = paddle::framework::ToMKLDNNDataType(input->type()); - const std::string key = gethash(src_tz, pooling_type, ksize, strides, - paddings, dt, ctx.op().Output("Out")); + const std::string key = CreateKey(ctx, src_tz, pooling_type, ksize, strides, + paddings, dt, ctx.op().Output("Out")); const std::string key_pool_p = key + "@pool_p"; const std::string key_pool_pd = key + "@pool_pd"; const std::string key_pool_src_mem_p = key + "@pool_src_mem_p"; @@ -198,7 +199,7 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { } // push primitive to stream and wait until it's executed - std::vector pipeline{*(pool_p.get())}; + std::vector pipeline{*pool_p}; stream(stream::kind::eager).submit(pipeline).wait(); output->set_layout(DataLayout::kMKLDNN); @@ -294,8 +295,8 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { // Get an unique name from "argument" name of "Out" variable // This name will be used as key when referring info from device context const std::string key = - gethash(diff_src_tz, pooling_type, ksize, strides, paddings, - memory::data_type::f32, ctx.op().Input("Out")); + CreateKey(ctx, diff_src_tz, pooling_type, ksize, strides, paddings, + memory::data_type::f32, ctx.op().Input("Out")); const std::string key_pool_bwd_p = key + "@pool_bwd_p"; const std::string key_pool_diff_src_mem_p = key + "@pool_diff_src_mem_p"; const std::string key_pool_diff_dst_mem_p = key + "@pool_diff_dst_mem_p"; @@ -367,8 +368,7 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { dev_ctx.SetBlob(key_pool_diff_dst_mem_p, diff_dst_memory); pool_bwd_p = std::make_shared( - pool_bwd_pd, *(diff_dst_memory.get()), *workspace_memory, - *(diff_src_memory)); + pool_bwd_pd, *diff_dst_memory, *workspace_memory, *diff_src_memory); dev_ctx.SetBlob(key_pool_bwd_p, pool_bwd_p); } else { @@ -404,7 +404,7 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { if (is_diff_dst_reordered) { pipeline.push_back(reorder_diff_dst); } - pipeline.push_back(*(pool_bwd_p.get())); + pipeline.push_back(*pool_bwd_p); mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); in_x_grad->set_layout(DataLayout::kMKLDNN); diff --git a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc index 0638e42873376bcec6e4de61494da46d1f0073d1..04cd60be964a3967a45e73122324c4b3fdf0b3d0 100644 --- a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc @@ -30,6 +30,18 @@ using framework::DataLayout; using mkldnn::stream; using platform::GetMKLDNNFormat; +std::string CreateKey(const paddle::framework::ExecutionContext& ctx, + const std::vector& src_tz, const float scale_data, + const bool is_negative) { + std::string key; + key.reserve(platform::MKLDNNHandler::MaxKeyLength); + platform::MKLDNNHandler::AppendKeyDims(&key, src_tz); + platform::MKLDNNHandler::AppendKey(&key, std::to_string(scale_data)); + platform::MKLDNNHandler::AppendKey(&key, std::to_string(is_negative)); + platform::MKLDNNHandler::AppendKey(&key, ctx.op().Output("Output")); + return key; +} + template class QuantOpKernel : public framework::OpKernel { public: @@ -47,32 +59,61 @@ class QuantOpKernel : public framework::OpKernel { const T* input_data = input->data(); - mkldnn::primitive_attr attri; - int mask = 0; - attri.set_output_scales(mask, {scale_data}); - - auto src_md = platform::MKLDNNMemDesc({src_tz}, memory::data_type::f32, - input->format()); - auto src_pd = mkldnn::memory::primitive_desc(src_md, engine); - auto src_memory = - std::make_shared(src_pd, to_void_cast(input_data)); - std::shared_ptr src_memory_p = - std::shared_ptr(new primitive::at(*src_memory)); - bool is_negative = ctx.Attr("is_negative_input"); - std::shared_ptr dst_pd; + std::string key = CreateKey(ctx, src_tz, scale_data, is_negative); + const std::string key_prim = key + "@reorder_p"; + const std::string key_src_mem = key + "@src_mem"; + const std::string key_dst_mem = key + "@dst_mem"; + + std::shared_ptr src_memory; std::shared_ptr dst_memory; - if (is_negative) { - platform::ConvMKLDNNHandler::SetDstMemory( - ctx, output, dst_tz, engine, dst_pd, dst_memory); + std::shared_ptr reorder_p; + reorder_p = std::static_pointer_cast(dev_ctx.GetBlob(key_prim)); + + if (reorder_p == nullptr) { + mkldnn::primitive_attr attri; + int mask = 0; + attri.set_output_scales(mask, {scale_data}); + + auto src_md = platform::MKLDNNMemDesc({src_tz}, memory::data_type::f32, + input->format()); + auto src_pd = mkldnn::memory::primitive_desc(src_md, engine); + src_memory = + std::make_shared(src_pd, to_void_cast(input_data)); + std::shared_ptr src_memory_p = + std::shared_ptr(new primitive::at(*src_memory)); + + std::shared_ptr dst_pd; + if (is_negative) { + platform::ConvMKLDNNHandler::SetDstMemory( + ctx, output, dst_tz, engine, dst_pd, dst_memory); + } else { + platform::ConvMKLDNNHandler::SetDstMemory( + ctx, output, dst_tz, engine, dst_pd, dst_memory); + } + auto reorder_pd = std::shared_ptr( + new reorder::primitive_desc(src_pd, *dst_pd, attri)); + reorder_p = std::shared_ptr( + new reorder(*reorder_pd, *src_memory_p, *dst_memory)); + + dev_ctx.SetBlob(key_prim, reorder_p); + dev_ctx.SetBlob(key_src_mem, src_memory); + dev_ctx.SetBlob(key_dst_mem, dst_memory); } else { - platform::ConvMKLDNNHandler::SetDstMemory( - ctx, output, dst_tz, engine, dst_pd, dst_memory); + src_memory = std::static_pointer_cast( + dev_ctx.GetBlob(key_src_mem)); + src_memory->set_data_handle(to_void_cast(input_data)); + + dst_memory = std::static_pointer_cast( + dev_ctx.GetBlob(key_dst_mem)); + auto place = ctx.GetPlace(); + if (is_negative) { + dst_memory->set_data_handle(output->mutable_data(place)); + } else { + dst_memory->set_data_handle(output->mutable_data(place)); + } } - auto reorder_pd = std::shared_ptr( - new reorder::primitive_desc(src_pd, *dst_pd, attri)); - auto reorder_p = std::shared_ptr( - new reorder(*reorder_pd, *src_memory_p, *dst_memory)); + pipeline.push_back(*reorder_p); stream(stream::kind::eager).submit(pipeline).wait(); output->set_layout(DataLayout::kMKLDNN); diff --git a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..44e8281424ba6937dad2c2dee1db4dee96b3b2eb --- /dev/null +++ b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc @@ -0,0 +1,94 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "mkldnn.hpp" +#include "paddle/fluid/framework/data_layout_transform.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/requantize_op.h" +#include "paddle/fluid/platform/mkldnn_helper.h" + +namespace paddle { +namespace operators { + +using mkldnn::memory; +using mkldnn::primitive; +using mkldnn::reorder; +using platform::to_void_cast; +using Tensor = framework::Tensor; +using framework::DataLayout; +using mkldnn::stream; +using platform::GetMKLDNNFormat; + +template +class ReQuantOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto scale_in = ctx.Attr("Scale_in"); + auto scale_out = ctx.Attr("Scale_out"); + auto* output = ctx.Output("Output"); + auto& dev_ctx = + ctx.template device_context(); + const auto& engine = dev_ctx.GetEngine(); + + std::vector pipeline; + std::vector src_tz = paddle::framework::vectorize2int(input->dims()); + std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + mkldnn::memory::data_type src_dt = + paddle::framework::ToMKLDNNDataType(input->type()); + mkldnn::memory::data_type dst_dt = src_dt; // TODO(Xiaoli) support + // requantize from different + // data type (e.g., s8 to u8) + mkldnn::memory::format src_fmt = memory::format::nhwc; + mkldnn::memory::format dst_fmt = memory::format::nhwc; + + const T* input_data = input->data(); + T* output_data = output->mutable_data(ctx.GetPlace()); + float scale_shift = scale_out / scale_in; + + mkldnn::primitive_attr attri; + int mask = 0; + attri.set_output_scales(mask, {scale_shift}); + + auto src_md = platform::MKLDNNMemDesc({src_tz}, src_dt, src_fmt); + auto src_pd = mkldnn::memory::primitive_desc(src_md, engine); + auto src_memory = + std::make_shared(src_pd, to_void_cast(input_data)); + std::shared_ptr src_memory_p = + std::shared_ptr(new primitive::at(*src_memory)); + + auto dst_md = platform::MKLDNNMemDesc({dst_tz}, dst_dt, dst_fmt); + auto dst_pd = mkldnn::memory::primitive_desc(dst_md, engine); + auto dst_memory = mkldnn::memory(dst_pd, to_void_cast(output_data)); + + auto reorder_pd = std::shared_ptr( + new reorder::primitive_desc(src_pd, dst_pd, attri)); + + auto reorder_p = std::shared_ptr( + new reorder(*reorder_pd, *src_memory_p, dst_memory)); + pipeline.push_back(*reorder_p); + stream(stream::kind::eager).submit(pipeline).wait(); + + output->set_layout(DataLayout::kMKLDNN); + output->set_format(GetMKLDNNFormat(dst_memory)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(requantize, MKLDNN, ::paddle::platform::CPUPlace, + ops::ReQuantOpKernel, ops::ReQuantOpKernel); diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc index d2b149535426d097fea4b8fffa9efe82bd6edc64..0ce552219458859e147ba207c94270bf84a1fe75 100644 --- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc @@ -66,8 +66,7 @@ class SoftmaxMKLDNNHandler : public platform::MKLDNNHandler { "Fail to find softmax primitive in device context"); if (softmax_p == nullptr) { softmax_p = std::make_shared( - *(softmax_pd_.get()), - *(static_cast(src_memory_p.get())), + *softmax_pd_, *(static_cast(src_memory_p.get())), *(static_cast(dst_memory_p.get()))); dev_ctx_.SetBlob(prim_key, softmax_p); } else { @@ -88,8 +87,8 @@ class SoftmaxMKLDNNHandler : public platform::MKLDNNHandler { "Fail to find softmax backward primitive in device context"); if (softmax_bwd_p == nullptr) { softmax_bwd_p = std::make_shared( - *softmax_bwd_pd_, *(dst_memory_p.get()), *(diff_dst_memory_p.get()), - *(diff_src_memory_p.get())); + *softmax_bwd_pd_, *dst_memory_p, *diff_dst_memory_p, + *diff_src_memory_p); dev_ctx_.SetBlob(prim_key, softmax_bwd_p); } else { is_reusing_ = true; @@ -159,6 +158,14 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { auto softmax_p = handler.AcquireSoftmax(softmax_dst_memory_p, softmax_src_memory_p); + // We cannot use softmax_dst_memory_p to get prim desc as + // it contains flattened dims (2D) while output tensor can + // have 2,3,4+ dims + auto output_mem_pd = paddle::platform::create_prim_desc_from_dims( + paddle::framework::vectorize2int(output->dims()), + mkldnn::memory::format::blocked); + output->set_mkldnn_prim_desc(output_mem_pd); + std::vector pipeline{ *(static_cast(softmax_p.get()))}; stream(stream::kind::eager).submit(pipeline).wait(); diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc index c39f94637a1abb5bfce9a5428419282f2b870c91..aef5b7d4311adfedb3db157f17506c3a2c76fbf6 100644 --- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc @@ -79,15 +79,6 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { memory::format input_format = input0.format(); - if (src_tz.size() == 1 && (input_format == memory::format::nchw || - input_format == memory::format::nhwc)) { - input_format = memory::format::x; - } - if (src_tz.size() == 2 && (input_format == memory::format::nchw || - input_format == memory::format::nhwc)) { - input_format = memory::format::nc; - } - for (int i = 0; i < N; i++) { PADDLE_ENFORCE(in_vars[i]->IsType(), "all inputs must be all LoDTensors"); @@ -115,12 +106,12 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { memory::desc(dst_tz, memory::data_type::f32, memory::format::any); auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_mpd); - + auto dst_mem_pd = sum_pd.dst_primitive_desc(); std::shared_ptr dst_mem; if (in_place) { - dst_mem.reset(new memory(sum_pd.dst_primitive_desc())); + dst_mem.reset(new memory(dst_mem_pd)); } else { - dst_mem.reset(new memory(sum_pd.dst_primitive_desc(), output_data)); + dst_mem.reset(new memory(dst_mem_pd, output_data)); } std::vector inputs; for (size_t i = 0; i < srcs_mem.size(); ++i) { @@ -145,107 +136,11 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { if (in_place) pipeline.push_back(reorder_prim); stream(stream::kind::eager).submit(pipeline).wait(); - output->set_layout(DataLayout::kMKLDNN); - output->set_format(output_format); - } else if (out_var->IsType()) { - // TODO(@mozga-intel) Add MKLDNN SelectedRows support - std::unique_ptr in0; - if (in_place) { - // If is in_place, we store the input[0] to in0 - auto& in_sel0 = in_vars[0]->Get(); - auto& rows = in_sel0.rows(); - in0.reset(new framework::SelectedRows(rows, in_sel0.height())); - in0->mutable_value()->ShareDataWith(in_sel0.value()); - } - - auto get_selected_row = [&](size_t i) -> const SelectedRows& { - if (i == 0 && in0) { - return *in0.get(); - } else { - return in_vars[i]->Get(); - } - }; - auto* out = ctx.Output("Out"); - out->mutable_rows()->clear(); - auto* out_value = out->mutable_value(); - - // Runtime InferShape - size_t first_dim = 0; - for (int i = 0; i < N; i++) { - auto& sel_row = get_selected_row(i); - first_dim += sel_row.rows().size(); - } - - std::vector in_dim; - for (int i = 0; i < N; i++) { - auto& sel_row = get_selected_row(i); - if (sel_row.rows().size() > 0) { - in_dim = framework::vectorize(sel_row.value().dims()); - break; - } - } - - if (in_dim.empty()) { - VLOG(3) << "WARNING: all the inputs are empty"; - in_dim = framework::vectorize(get_selected_row(N - 1).value().dims()); - } else { - in_dim[0] = static_cast(first_dim); - } - - in_dim[0] = static_cast(first_dim); - - out_value->Resize(framework::make_ddim(in_dim)); - - out_value->mutable_data(ctx.GetPlace()); - - // if all the input sparse vars are empty, no need to - // merge these vars. - if (first_dim == 0UL) { - return; - } - - math::SelectedRowsAddTo functor; - int64_t offset = 0; - for (int i = 0; i < N; i++) { - auto& sel_row = get_selected_row(i); - if (sel_row.rows().size() == 0) { - continue; - } - PADDLE_ENFORCE_EQ(out->height(), sel_row.height()); - functor(ctx.template device_context(), sel_row, - offset, out); - offset += sel_row.value().numel(); - } - } else if (out_var->IsType()) { - // TODO(@mozga-intel) Add MKLDNN LoDTensorArray support - auto& out_array = *out_var->GetMutable(); - for (size_t i = in_place ? 1 : 0; i < in_vars.size(); ++i) { - PADDLE_ENFORCE(in_vars[i]->IsType(), - "Only support all inputs are TensorArray"); - auto& in_array = in_vars[i]->Get(); - - for (size_t i = 0; i < in_array.size(); ++i) { - if (in_array[i].numel() != 0) { - if (i >= out_array.size()) { - out_array.resize(i + 1); - } - if (out_array[i].numel() == 0) { - framework::TensorCopy(in_array[i], in_array[i].place(), - ctx.device_context(), &out_array[i]); - out_array[i].set_lod(in_array[i].lod()); - } else { - PADDLE_ENFORCE(out_array[i].lod() == in_array[i].lod()); - auto in = EigenVector::Flatten(in_array[i]); - auto result = EigenVector::Flatten(out_array[i]); - result.device(*ctx.template device_context() - .eigen_device()) = result + in; - } - } - } - } - } else { - PADDLE_THROW("Unexpected branch, output variable type is %s", - framework::ToTypeName(out_var->Type())); + output->set_mkldnn_prim_desc(dst_mem_pd); + } else { // Fallback to naive version + // TODO(@mozga-intel) Add MKLDNN SelectedRows & LoDTensorArray support + SumKernel reference_kernel; + reference_kernel.Compute(ctx); } } }; diff --git a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc index e6df7028f540d0928e2bb0763bd4cfef12059665..e41bfb80dfc0452955f7978f74ccfea184886b69 100644 --- a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc @@ -52,7 +52,7 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel { mkldnn_engine, key); auto transpose_src_memory_p = handler.AcquireSrcMemory( - input->format(), platform::to_void_cast(input_data)); + input->get_mkldnn_prim_desc(), platform::to_void_cast(input_data)); auto transpose_dst_memory_p = handler.AcquireDstMemory(output, ctx.GetPlace()); auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p, @@ -61,6 +61,15 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel { std::vector pipeline; pipeline.push_back(*transpose_p); mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + + // Transpose did change logical dimensions of Tensor, but reorder does not. + // Reorder does change only physical layout eg. format , strides + // so we need to create new primitive descriptor with changed logical layout + // so it match output shape + auto output_mem_pd = paddle::platform::create_prim_desc_from_dims( + paddle::framework::vectorize2int(output->dims()), + mkldnn::memory::format::blocked); + output->set_mkldnn_prim_desc(output_mem_pd); } }; @@ -102,8 +111,9 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel { platform::TransposeMKLDNNHandler handler(nchw_tz, reversed_axis, dev_ctx, mkldnn_engine, key); - auto transpose_src_memory_p = handler.AcquireSrcMemory( - out_grad->format(), platform::to_void_cast(out_grad_data)); + auto transpose_src_memory_p = + handler.AcquireSrcMemory(out_grad->get_mkldnn_prim_desc(), + platform::to_void_cast(out_grad_data)); auto transpose_dst_memory_p = handler.AcquireDstMemory(x_grad, ctx.GetPlace()); auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p, @@ -112,6 +122,15 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel { std::vector pipeline; pipeline.push_back(*transpose_p); mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + + // Transpose did change logical dimensions of Tensor, but reorder does not. + // Reorder does change only physical layout eg. format , strides + // so we need to create new primitive descriptor with changed logical layout + // so it match output shape + auto x_grad_mem_pd = paddle::platform::create_prim_desc_from_dims( + paddle::framework::vectorize2int(x_grad->dims()), + mkldnn::memory::format::blocked); + x_grad->set_mkldnn_prim_desc(x_grad_mem_pd); } }; diff --git a/paddle/fluid/operators/ngraph/CMakeLists.txt b/paddle/fluid/operators/ngraph/CMakeLists.txt index 6b256ef02666c21ec1db3f6922b56bb23363b4a0..7559d29ce233dfcebf8b3118b4c700c35fe15d32 100644 --- a/paddle/fluid/operators/ngraph/CMakeLists.txt +++ b/paddle/fluid/operators/ngraph/CMakeLists.txt @@ -2,4 +2,5 @@ if(WITH_NGRAPH) cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph) cc_library(ngraph_engine SRCS ngraph_engine.cc DEPS ngraph_bridge framework_proto) op_library(ngraph_engine_op DEPS ngraph_engine op_registry op_info device_context) + add_subdirectory(ops) endif() diff --git a/paddle/fluid/operators/ngraph/ngraph_bridge.cc b/paddle/fluid/operators/ngraph/ngraph_bridge.cc index 08d72a5b3978097f4d3dca2e38bef2c3d89cfdc8..dafc31b546e3ca6d8dc8d5634dd51cff9fe5bfb7 100644 --- a/paddle/fluid/operators/ngraph/ngraph_bridge.cc +++ b/paddle/fluid/operators/ngraph/ngraph_bridge.cc @@ -14,52 +14,27 @@ limitations under the License. */ #include #include +#include #include #include "ngraph/ngraph.hpp" #include "paddle/fluid/operators/ngraph/ngraph_bridge.h" #include "paddle/fluid/operators/ngraph/ngraph_ops.h" +#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/ngraph_helper.h" namespace paddle { namespace operators { -namespace NG_OPS = paddle::operators::ngraphs; -std::map&, - std::shared_ptr>>)>> - NgraphBridge::NG_NODE_MAP = { - {"accuracy", NG_OPS::BuildAccuracyNode}, - {"conv2d", NG_OPS::BuildConv2dNode}, - {"conv2d_grad", NG_OPS::BuildConv2dGradNode}, - {"batch_norm", NG_OPS::BuildBatchNormNode}, - {"batch_norm_grad", NG_OPS::BuildBatchNormGradNode}, - {"elementwise_add", NG_OPS::BuildElementwiseAddNode}, - {"elementwise_add_grad", NG_OPS::BuildElementwiseAddGradNode}, - {"fill_constant", NG_OPS::BuildFillConstantNode}, - {"mean", NG_OPS::BuildMeanNode}, - {"mean_grad", NG_OPS::BuildMeanGradNode}, - {"mul", NG_OPS::BuildMulNode}, - {"mul_grad", NG_OPS::BuildMulGradNode}, - {"pool2d", NG_OPS::BuildPool2dNode}, - {"pool2d_grad", NG_OPS::BuildPool2dGradNode}, - {"softmax", NG_OPS::BuildSoftmaxNode}, - {"softmax_grad", NG_OPS::BuildSoftmaxGradNode}, - {"scale", NG_OPS::BuildScaleNode}, - {"sigmoid", NG_OPS::BuildUnaryNode}, - {"sum", NG_OPS::BuildSumNode}, - {"relu", NG_OPS::BuildUnaryNode}, - {"relu_grad", NG_OPS::BuildReluGradNode}, - {"tanh", NG_OPS::BuildUnaryNode}, - {"tanh_grad", NG_OPS::BuildTanhGradNode}, - {"top_k", NG_OPS::BuildTopKNode}}; +bool NgraphBridge::isRegister(const std::string& str) { + return ops::NgraphSingleton::Lookup(str); +} void NgraphBridge::BuildNgNode( const std::shared_ptr& op) { auto& op_type = op->Type(); - NG_NODE_MAP[op_type](op, ngb_node_map_); + ops::NgraphSingleton::BuildNode(ngb_node_map_, op, op_type); } } // namespace operators diff --git a/paddle/fluid/operators/ngraph/ngraph_bridge.h b/paddle/fluid/operators/ngraph/ngraph_bridge.h index c57988f8f6322e76678c572aa21ff5b17b9e3c22..b609c284959238689eaf35c87d1bc4e4330b5c1f 100644 --- a/paddle/fluid/operators/ngraph/ngraph_bridge.h +++ b/paddle/fluid/operators/ngraph/ngraph_bridge.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include +#include #include #include @@ -28,13 +29,6 @@ namespace operators { class NgraphBridge { public: - static std::map< - std::string, - std::function&, - std::shared_ptr>>)>> - NG_NODE_MAP; - explicit NgraphBridge( std::shared_ptr< std::unordered_map>> @@ -43,6 +37,8 @@ class NgraphBridge { void BuildNgNode(const std::shared_ptr& op); + static bool isRegister(const std::string& str); + private: std::shared_ptr< std::unordered_map>> diff --git a/paddle/fluid/operators/ngraph/ngraph_engine.cc b/paddle/fluid/operators/ngraph/ngraph_engine.cc index bec4b514a218715134d2366dd7efd7cf5b377b68..cd32200e925193b393f4531b87ed6b1e4291109d 100644 --- a/paddle/fluid/operators/ngraph/ngraph_engine.cc +++ b/paddle/fluid/operators/ngraph/ngraph_engine.cc @@ -16,7 +16,10 @@ limitations under the License. */ #include #include +#include #include +#include +#include #include #include "paddle/fluid/framework/block_desc.h" @@ -26,7 +29,6 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/operators/ngraph/ngraph_bridge.h" #include "paddle/fluid/operators/ngraph/ngraph_engine.h" @@ -39,44 +41,75 @@ static ngraph::Shape Ddim2Shape(const framework::DDim& dims) { for (int i = 0; i < dims.size(); ++i) { int k = dims[i]; k = k == 0 ? 1 : k; - sp.push_back(k); + sp.emplace_back(k); } return sp; } +static framework::DDim Shape2Ddim(const ngraph::Shape& shape) { + std::vector dims; + for (size_t i = 0; i < shape.size(); ++i) { + int64_t k = shape[i]; + dims.emplace_back(k); + } + return framework::make_ddim(dims); +} + static std::map pd2ng_type_map = { {framework::proto::VarType::FP32, ngraph::element::f32}, {framework::proto::VarType::FP64, ngraph::element::f64}, {framework::proto::VarType::INT32, ngraph::element::i32}, {framework::proto::VarType::INT64, ngraph::element::i64}, - {framework::proto::VarType::BOOL, ngraph::element::boolean}, -}; - -std::unordered_map> - NgraphEngine::func_cache_ = {}; + {framework::proto::VarType::BOOL, ngraph::element::boolean}}; + +static std::map + ng2pd_type_map = { + {ngraph::element::f32, framework::proto::VarType::FP32}, + {ngraph::element::f64, framework::proto::VarType::FP64}, + {ngraph::element::i32, framework::proto::VarType::INT32}, + {ngraph::element::i64, framework::proto::VarType::INT64}, + {ngraph::element::boolean, framework::proto::VarType::BOOL}}; + +std::vector NgraphEngine::feed_vars = {}; +std::vector NgraphEngine::fetch_vars = {}; +framework::Variable* NgraphEngine::pre_var_ptr = nullptr; +const framework::BlockDesc* NgraphEngine::p_bdesc = nullptr; + +std::unordered_map NgraphEngine::engine_cache = {}; +std::unordered_map>> + NgraphEngine::t_in_cache_ = {}; std::shared_ptr NgraphEngine::backend_ = ngraph::runtime::Backend::create("CPU"); static std::vector> NgraphOpIntervals( - framework::BlockDesc* block) { + std::vector>* ops) { + NgraphEngine::feed_vars.clear(); + NgraphEngine::fetch_vars.clear(); std::vector> intervals; - auto ops = block->AllOps(); - int size = ops.size(); + + int size = ops->size(); int left = 0; - while (left < size && ops.at(left)->Type() != framework::kFeedOpType) { + while (left < size && ops->at(left)->Type() != framework::kFeedOpType) { ++left; } if (left == size) { return intervals; } - while (left < size && ops.at(left)->Type() == framework::kFeedOpType) { + + while (left < size && ops->at(left)->Type() == framework::kFeedOpType) { + for (auto& var_name_item : ops->at(left)->Outputs()) { + for (auto& var_name : var_name_item.second) { + NgraphEngine::feed_vars.emplace_back(var_name); + } + } ++left; } int right = left; - while (right < size && ops.at(right)->Type() != framework::kFetchOpType) { + while (right < size && ops->at(right)->Type() != framework::kFetchOpType) { ++right; } if (right == size) { @@ -84,87 +117,124 @@ static std::vector> NgraphOpIntervals( } if (left >= right) return intervals; + int index = right; + while (index < size && ops->at(index)->Type() == framework::kFetchOpType) { + for (auto& var_name_item : ops->at(index)->Inputs()) { + for (auto& var_name : var_name_item.second) { + NgraphEngine::fetch_vars.emplace_back(var_name); + } + } + ++index; + } + // (left, right - 1) represents indices between feed and fetch int pivot = left; while (pivot < right) { - auto op_type = ops.at(pivot)->Type(); - if (NgraphBridge::NG_NODE_MAP.find(op_type) == - NgraphBridge::NG_NODE_MAP.end()) { + auto op_type = ops->at(pivot)->Type(); + if (NgraphBridge::isRegister(op_type)) { ++pivot; } else { int start = pivot, end = start; while (pivot < right && - (NgraphBridge::NG_NODE_MAP.find(ops.at(pivot)->Type()) != - NgraphBridge::NG_NODE_MAP.end())) { + (!NgraphBridge::isRegister(ops->at(pivot)->Type()))) { ++pivot; ++end; } std::vector interval = {start, end}; - intervals.push_back(interval); + intervals.emplace_back(interval); } } // end while return intervals; } -static void SubstituteNgraphOp(framework::BlockDesc* block, - std::string block_str, - std::vector interval) { - framework::ProgramDesc program; - block->RemoveOp(interval.at(0), interval.at(1)); - auto* ng_op = block->InsertOp(interval.at(0)); - ng_op->SetType("ngraph_engine"); - ng_op->SetAttr("interval", interval); - ng_op->SetAttr("graph", block_str); +static void SubstituteNgraphOp( + std::vector>* ops, + std::string engine_key, std::string block_str, std::vector interval) { + framework::OpDesc ng_op_desc(nullptr); + ng_op_desc.SetType("ngraph_engine"); + ng_op_desc.SetAttr("interval", interval); + ng_op_desc.SetAttr("engine_key", engine_key); + ng_op_desc.SetAttr("graph", block_str); + + ops->erase(ops->begin() + interval[0], ops->begin() + interval[1]); + ops->insert(ops->begin() + interval[0], + framework::OpRegistry::CreateOp(ng_op_desc)); } -// TODO(baojun-nervana): Move EnableNgraph to compile time per PR #15089 -void NgraphEngine::EnableNgraph(const framework::ProgramDesc& program) { -#ifdef PADDLE_WITH_NGRAPH - VLOG(4) << "use_ngraph=True"; - for (size_t bid = 0; bid < program.Size(); ++bid) { - // TODO(baojun-nervana): Remove the const_cast - auto* block = - const_cast(program).MutableBlock(bid); - std::string block_str = block->Proto()->SerializeAsString(); - auto intervals = NgraphOpIntervals(block); - for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) { - SubstituteNgraphOp(block, block_str, *it); - } +std::string SerializedBlock(const std::vector& op_descs) { + framework::proto::BlockDesc block_proto; + framework::BlockDesc block_desc(nullptr, &block_proto); + block_desc.Proto()->set_parent_idx(-1); + block_desc.Proto()->set_idx(0); + + for (auto* op_desc : op_descs) { + auto* op = block_desc.AppendOp(); + *op->Proto() = *op_desc->Proto(); + } + return block_desc.Proto()->SerializeAsString(); +} + +std::string GenerateEngineKey(const framework::BlockDesc& bdesc) { + framework::proto::BlockDesc block_proto; + framework::BlockDesc block_desc(nullptr, &block_proto); + block_desc.Proto()->set_parent_idx(-1); + block_desc.Proto()->set_idx(0); + + for (auto& op_desc : bdesc.AllOps()) { + auto* op = block_desc.AppendOp(); + *op->Proto() = *op_desc->Proto(); + } + auto engine_key = std::to_string( + std::hash()(block_desc.Proto()->SerializeAsString())); + return engine_key; +} + +std::string GenerateEngineKey(const std::vector& engine_inputs, + const std::vector& engine_outputs, + int size) { + std::string engine_hash_key = ""; + for (auto name : engine_inputs) { + engine_hash_key += name; + } + for (auto name : engine_outputs) { + engine_hash_key += name; + } + engine_hash_key += std::to_string(size); + auto engine_key = std::to_string(std::hash()(engine_hash_key)); + return engine_key; +} + +void NgraphEngine::FuseNgraphOps( + const framework::BlockDesc& block_desc, + std::vector>* ops) { + NgraphEngine::p_bdesc = &block_desc; + auto intervals = NgraphOpIntervals(ops); + std::string engine_key = + GenerateEngineKey(feed_vars, fetch_vars, ops->size()); + for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) { + SubstituteNgraphOp(ops, engine_key, "", *it); } -#else - LOG(WARNING) - << "'NGRAPH' is not supported, Please re-compile with WITH_NGRAPH option"; -#endif } NgraphEngine::NgraphEngine(const framework::Scope& scope, const platform::Place& place, - const std::string& serialized_graph, - const std::vector& interval) + const framework::ExecutionContext& ctx) : scope_(scope), place_(place) { + std::string serialized_graph = ctx.Attr("graph"); + auto interval = ctx.Attr>("interval"); + std::string engine_key = ctx.Attr("engine_key"); + var_in_node_map_ = std::make_shared< std::unordered_map>>(); var_node_map_ = std::make_shared< std::unordered_map>>(); - func_cache_key_ = std::to_string(interval[0]) + std::to_string(interval[1]) + - serialized_graph; - - framework::proto::BlockDesc bdesc; - bdesc.ParseFromString(serialized_graph); - framework::BlockDesc block(nullptr, &bdesc); - - Prepare(block, interval); - - BuildNgIO(); - - GetNgFunction(); + GetNgFunction(engine_key, interval); } -void NgraphEngine::Prepare(const framework::BlockDesc& block, - const std::vector& interval) { - for (auto& var : block.AllVars()) { +void NgraphEngine::Prepare(const std::vector& interval) { + for (auto& var : p_bdesc->AllVars()) { if (!(var->GetType() == framework::proto::VarType::SELECTED_ROWS || var->GetType() == framework::proto::VarType::LOD_TENSOR || var->GetType() == framework::proto::VarType::LOD_TENSOR_ARRAY)) { @@ -191,108 +261,57 @@ void NgraphEngine::Prepare(const framework::BlockDesc& block, } } - auto ops_desc = block.AllOps(); - int idx = interval[0]; - while (idx < interval[1]) { - auto op_desc = ops_desc.at(idx); - auto op = framework::OpRegistry::CreateOp(*op_desc); - fused_ops_.push_back(std::move(op)); - ++idx; - } - - while (ops_desc.at(idx)->Type() != framework::kFetchOpType) { - auto op_desc = ops_desc.at(idx); - for (auto& var_name_item : op_desc->Inputs()) { - for (auto& var_name : var_name_item.second) { - post_op_inputs_.insert(var_name); - } - } - ++idx; - } - - while (idx < static_cast(ops_desc.size()) && - ops_desc.at(idx)->Type() == framework::kFetchOpType) { - std::string fetch_target_name = ops_desc.at(idx)->Input("X")[0]; - fetches_.insert(fetch_target_name); - ++idx; - } - - if (ops_desc.at(interval.at(0) - 1)->Type() == framework::kFeedOpType && - ops_desc.at(interval.at(1))->Type() == framework::kFetchOpType) { - ng_op_state_ = OpState::FULL; + std::vector ops_desc; + for (auto op_desc : p_bdesc->AllOps()) { + ops_desc.emplace_back(op_desc); } - for (auto* op_desc : ops_desc) { + for (auto op_desc : ops_desc) { if (op_desc->Type().find("_grad") != std::string::npos) { - ng_op_state_ = ng_op_state_ == OpState::FULL ? OpState::FULL_TRAIN - : OpState::PARTIAL_TRAIN; + this->is_test_ = false; break; } } - if (ng_op_state_ != OpState::FULL_TRAIN && - ng_op_state_ != OpState::PARTIAL_TRAIN) { - ng_op_state_ = ng_op_state_ == OpState::FULL ? OpState::FULL_TEST - : OpState::PARTIAL_TEST; + if (interval[0] > 0 && + ops_desc.at(interval[0] - 1)->Type() == framework::kFeedOpType && + interval[1] < static_cast(ops_desc.size()) && + ops_desc.at(interval.at(1))->Type() == framework::kFetchOpType) { + this->op_state_ = OpState::FULL; } -} -void NgraphEngine::GetNgInputShape( - std::shared_ptr op) { - framework::RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_); - op->RuntimeInferShape(scope_, place_, ctx); - for (auto& var_name_item : op->Inputs()) { - for (auto& var_name : var_name_item.second) { - auto* var = scope_.FindVar(var_name); - if (var && var->IsType()) { - auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); - auto sp = Ddim2Shape(tensor_pd->dims()); - if (std::find(var_in_.begin(), var_in_.end(), var_name) != - var_in_.end()) { - if (var_node_map_->find(var_name) == var_node_map_->end()) { - // auto ng_type = pd2ng_type_map.at(GetDataTypeOfVar(var)); - auto ng_type = var_type_map_.at(var_name); - auto prm = - std::make_shared(ng_type, sp, true); - (*var_node_map_)[var_name] = prm; - (*var_in_node_map_)[var_name] = prm; - } - } - } - } + if (this->op_state_ == OpState::FULL) { + this->op_state_ = this->is_test_ ? OpState::FULL_TEST : OpState::FULL_TRAIN; + } else { + this->op_state_ = + this->is_test_ ? OpState::PARTIAL_TEST : OpState::PARTIAL_TRAIN; } -} -void NgraphEngine::BuildNgNodes() { - for (auto& op : fused_ops_) { - for (auto& var_name_item : op->Outputs()) { + int idx = interval[0]; + while (idx < interval[1]) { + this->fused_ops_.emplace_back( + framework::OpRegistry::CreateOp(*(ops_desc[idx]))); + ++idx; + } + while (ops_desc.at(idx)->Type() != framework::kFetchOpType) { + auto op_desc = ops_desc.at(idx); + for (auto& var_name_item : op_desc->Inputs()) { for (auto& var_name : var_name_item.second) { - if (var_node_map_->find(var_name) == var_node_map_->end()) { - auto* var = scope_.FindVar(var_name); - if (var && var->IsType()) { - auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); - auto& ddim = tensor_pd->dims(); - auto ng_shape = Ddim2Shape(ddim); - auto ng_type = var_type_map_.at(var_name); - auto prm = std::make_shared(ng_type, - ng_shape, true); - (*var_node_map_)[var_name] = prm; - } - } + this->post_op_inputs_.insert(var_name); } } + ++idx; } - NgraphBridge ngb(var_node_map_); - for (auto& op : fused_ops_) { - ngb.BuildNgNode(op); - } + + BuildNgIO(ops_desc, interval); } -void NgraphEngine::BuildNgIO() { +void NgraphEngine::BuildNgIO(const std::vector& ops_desc, + const std::vector& interval) { std::unordered_set inputs; std::unordered_set outputs; - - for (auto& op : fused_ops_) { + for (int i = interval[0]; i < interval[1]; ++i) { + auto op = ops_desc[i]; for (auto& var_name_item : op->Inputs()) { for (auto& var_name : var_name_item.second) { inputs.insert(var_name); @@ -301,15 +320,11 @@ void NgraphEngine::BuildNgIO() { std::find(var_in_.begin(), var_in_.end(), var_name) == var_in_.end()) { // fill var_in here to keep lhs and rhs order - var_in_.push_back(var_name); + this->var_in_.emplace_back(var_name); } } } - if (op->Type() != "fill_constant") { - GetNgInputShape(op); - } - for (auto& var_name_item : op->Outputs()) { PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, "op %s has more than 1 output - Not handling yet", @@ -321,171 +336,278 @@ void NgraphEngine::BuildNgIO() { } // var_out.clear(); - for (auto& op : fused_ops_) { + for (int i = interval[0]; i < interval[1]; ++i) { + auto op = ops_desc[i]; for (auto& var_name_item : op->Outputs()) { PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, "op %s has more than 1 output - Not handling yet", op->Type()); for (auto& var_name : var_name_item.second) { - switch (ng_op_state_) { + switch (this->op_state_) { case OpState::PARTIAL_TEST: if (post_op_inputs_.find(var_name) != post_op_inputs_.end() || - fetches_.find(var_name) != fetches_.end()) { - var_out_.push_back(var_name); + find(fetch_vars.begin(), fetch_vars.end(), var_name) != + fetch_vars.end()) { + this->var_out_.emplace_back(var_name); } break; case OpState::FULL_TEST: - if (fetches_.find(var_name) != fetches_.end()) { - var_out_.push_back(var_name); + if (find(fetch_vars.begin(), fetch_vars.end(), var_name) != + fetch_vars.end()) { + this->var_out_.emplace_back(var_name); } break; case OpState::PARTIAL_TRAIN: - if (fetches_.find(var_name) != fetches_.end() || + if (find(fetch_vars.begin(), fetch_vars.end(), var_name) != + fetch_vars.end() || post_op_inputs_.find(var_name) != post_op_inputs_.end() || persistables_.find(var_name) != persistables_.end()) { - var_out_.push_back(var_name); + this->var_out_.emplace_back(var_name); } break; case OpState::FULL_TRAIN: - if (fetches_.find(var_name) != fetches_.end() || + if (find(fetch_vars.begin(), fetch_vars.end(), var_name) != + fetch_vars.end() || persistables_.find(var_name) != persistables_.end()) { - var_out_.push_back(var_name); + this->var_out_.emplace_back(var_name); } break; default: - var_out_.push_back(var_name); + this->var_out_.emplace_back(var_name); } } } } + for (size_t i = 0; i < var_in_.size(); ++i) { + auto var_name = var_in_[i]; + if (persistables_.find(var_name) == persistables_.end()) { + var_in_updates_.emplace_back(i); + } + } } -void NgraphEngine::BuildNgFunction() { +void NgraphEngine::GetNgInputShape() { + for (auto& var_name : var_in_) { + auto* var = scope_.FindVar(var_name); + if (var && var->IsType()) { + auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); + auto sp = Ddim2Shape(tensor_pd->dims()); + auto ng_type = var_type_map_[var_name]; + auto prm = std::make_shared(ng_type, sp, true); + (*var_node_map_)[var_name] = prm; + (*var_in_node_map_)[var_name] = prm; + } + } +} + +void NgraphEngine::BuildNgNodes() { + for (auto& op : fused_ops_) { + for (auto& var_name_item : op->Outputs()) { + for (auto& var_name : var_name_item.second) { + if (var_node_map_->find(var_name) == var_node_map_->end()) { + auto* var = scope_.FindVar(var_name); + if (var && var->IsType()) { + auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); + auto& ddim = tensor_pd->dims(); + auto ng_shape = Ddim2Shape(ddim); + auto ng_type = var_type_map_[var_name]; + auto prm = std::make_shared(ng_type, + ng_shape, true); + (*var_node_map_)[var_name] = prm; + } + } + } + } + } + + NgraphBridge ngb(var_node_map_); + for (auto& op : fused_ops_) { + ngb.BuildNgNode(op); + } +} + +void NgraphEngine::RunInferShape() { + for (auto& op : fused_ops_) { + framework::RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_); + op->RuntimeInferShape(scope_, place_, ctx); + } +} + +void NgraphEngine::BuildNgFunction(const std::vector& interval) { + Prepare(interval); + RunInferShape(); + GetNgInputShape(); BuildNgNodes(); ngraph_function_ = nullptr; ngraph::NodeVector func_outputs; ngraph::ParameterVector func_inputs; for (auto& vo : var_out_) { - func_outputs.push_back(var_node_map_->at(vo)); + func_outputs.emplace_back(var_node_map_->at(vo)); } for (auto& vi : var_in_) { std::shared_ptr prm = std::dynamic_pointer_cast( var_in_node_map_->at(vi)); - func_inputs.push_back(prm); + func_inputs.emplace_back(prm); } ngraph_function_ = std::make_shared(func_outputs, func_inputs); } -void NgraphEngine::GetNgFunction() { - bool cache_on = true; - if (cache_on) { - std::string input_shape_str; - for (auto& var_name : var_in_) { - auto shape = var_node_map_->at(var_name)->get_shape(); - for (size_t i = 0; i < shape.size(); ++i) { - input_shape_str += std::to_string(shape.at(i)); +void NgraphEngine::GetNgFunction(std::string engine_key, + const std::vector& interval) { + bool use_cache = true; + if (use_cache) { + this->func_cache_key_ = ""; + for (int i = 0; i < std::min(static_cast(feed_vars.size()), 10); ++i) { + auto* var = scope_.FindVar(feed_vars[i]); + if (var && var->IsType()) { + auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); + auto dims = tensor_pd->dims(); + for (int j = 0; j < dims.size(); ++j) { + func_cache_key_ += std::to_string(dims[j]); + } } } - func_cache_key_ = input_shape_str + func_cache_key_; - if (func_cache_.find(func_cache_key_) != func_cache_.end()) { - ngraph_function_ = func_cache_.at(func_cache_key_); - } else { - BuildNgFunction(); - func_cache_[func_cache_key_] = ngraph_function_; + func_cache_key_ += std::to_string(interval[0]) + "_" + + std::to_string(interval[1]) + engine_key; + func_cache_key_ = std::to_string(std::hash()(func_cache_key_)); + + if (engine_cache.find(func_cache_key_) != engine_cache.end()) { + if (engine_cache[func_cache_key_].persistables.size() == 0) { + engine_cache.clear(); + t_in_cache_.clear(); + } else { + auto var_name = engine_cache[func_cache_key_].persistables.begin(); + framework::Variable* var = scope_.FindVar(*var_name); + if (var != pre_var_ptr) { + engine_cache.clear(); + t_in_cache_.clear(); + } + pre_var_ptr = var; + } + } + + if (engine_cache.find(func_cache_key_) == engine_cache.end()) { + BuildNgFunction(interval); + engine_cache[func_cache_key_].ngraph_function = this->ngraph_function_; + engine_cache[func_cache_key_].persistables = this->persistables_; + engine_cache[func_cache_key_].var_in_updates = this->var_in_updates_; + engine_cache[func_cache_key_].var_in = this->var_in_; + engine_cache[func_cache_key_].var_out = this->var_out_; + engine_cache[func_cache_key_].is_test = this->is_test_; } } else { - BuildNgFunction(); + BuildNgFunction(interval); } } void NgraphEngine::Run(const framework::Scope& scope, const platform::Place& place) const { - std::vector> t_in; - std::vector> t_out; + std::shared_ptr ng_func; + const std::set* p_persistables; + const std::vector* p_var_in_updates; + const std::vector* p_var_in; + const std::vector* p_var_out; + bool is_test; + + bool use_cache = true; + if (use_cache) { + PADDLE_ENFORCE(engine_cache.find(func_cache_key_) != engine_cache.end(), + "Cannot find cached data to run ngraph function"); + ng_func = engine_cache[func_cache_key_].ngraph_function; + p_persistables = &(engine_cache[func_cache_key_].persistables); + p_var_in_updates = &(engine_cache[func_cache_key_].var_in_updates); + p_var_in = &(engine_cache[func_cache_key_].var_in); + p_var_out = &(engine_cache[func_cache_key_].var_out); + is_test = engine_cache[func_cache_key_].is_test; + } else { + ng_func = ngraph_function_; + p_persistables = &this->persistables_; + p_var_in_updates = &this->var_in_updates_; + p_var_in = &this->var_in_; + p_var_out = &this->var_out_; + is_test = this->is_test_; + } - for (size_t i = 0; i < var_in_.size(); ++i) { - auto vi = var_in_.at(i); - auto sp = var_node_map_->at(vi)->get_shape(); - std::shared_ptr ti; - auto* var = scope.FindVar(vi); - if (var && var->IsType()) { - auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var); - PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()), - "Ensure ngraph tensor layout align with paddle tensor"); - auto ng_type = var_type_map_.at(vi); - if (ng_type == ngraph::element::f32) { - auto pd_arr = tensor_pd->mutable_data(place); - ti = backend_->create_tensor(ngraph::element::f32, sp, pd_arr); - } else if (ng_type == ngraph::element::i32) { - const int* arr = tensor_pd->data(); - ti = backend_->create_tensor(ngraph::element::i32, sp, - const_cast(arr)); - } else if (ng_type == ngraph::element::i64) { - auto pd_arr = tensor_pd->mutable_data(place); - ti = backend_->create_tensor(ngraph::element::i64, sp, pd_arr); - } else if (ng_type == ngraph::element::f64) { - auto pd_arr = tensor_pd->mutable_data(place); - ti = backend_->create_tensor(ngraph::element::f64, sp, pd_arr); - } else if (ng_type == ngraph::element::boolean) { - auto pd_arr = tensor_pd->mutable_data(place); - ti = backend_->create_tensor(ngraph::element::boolean, sp, pd_arr); + std::vector>* p_t_in; + std::vector> t_in = {}; + + auto m_parameters = ng_func->get_parameters(); + auto m_results = ng_func->get_results(); + if (is_test && use_cache && + t_in_cache_.find(func_cache_key_) != t_in_cache_.end()) { + p_t_in = &(t_in_cache_[func_cache_key_]); + for (size_t i = 0; i < p_var_in_updates->size(); ++i) { + int index = p_var_in_updates->at(i); + auto vi = p_var_in->at(index); + auto sp = m_parameters[index]->get_shape(); + auto ng_type = m_parameters[index]->get_element_type(); + std::shared_ptr ti; + auto* var = scope.FindVar(vi); + if (var && var->IsType()) { + auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var); + void* pd_arr = tensor_pd->mutable_data(place, ng2pd_type_map[ng_type]); + ti = backend_->create_tensor(ng_type, sp, pd_arr); + (*p_t_in)[index] = ti; } else { - PADDLE_THROW("Data type not handling for var %s", vi); + PADDLE_THROW("Cannot find var or tensor with var name %s", vi); } + } + } else { + if (is_test && use_cache) { + p_t_in = &(t_in_cache_[func_cache_key_]); } else { - PADDLE_THROW("Cannot find var or tensor with var name %s", vi); + p_t_in = &t_in; } - bool is_test = (ng_op_state_ == OpState::PARTIAL_TEST || - ng_op_state_ == OpState::FULL_TEST) - ? true - : false; - bool is_persistable = - (persistables_.find(vi) != persistables_.end()) ? true : false; - if (is_test && is_persistable) { - ti->set_stale(false); + + for (size_t i = 0; i < p_var_in->size(); ++i) { + auto vi = p_var_in->at(i); + auto sp = m_parameters[i]->get_shape(); + auto ng_type = m_parameters[i]->get_element_type(); + std::shared_ptr ti; + auto* var = scope.FindVar(vi); + if (var && var->IsType()) { + auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var); + void* pd_arr = tensor_pd->mutable_data(place, ng2pd_type_map[ng_type]); + PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()), + "Ensure ngraph tensor layout align with paddle tensor"); + ti = backend_->create_tensor(ng_type, sp, pd_arr); + } else { + PADDLE_THROW("Cannot find var or tensor with var name %s", vi); + } + bool is_persistable = + (p_persistables->find(vi) != p_persistables->end()) ? true : false; + if (is_test && is_persistable) { + ti->set_stale(false); + } + (*p_t_in).emplace_back(ti); } - t_in.push_back(ti); } - for (size_t i = 0; i < var_out_.size(); ++i) { - auto vo = var_out_[i]; + std::vector> t_out = {}; + for (size_t i = 0; i < p_var_out->size(); ++i) { + auto vo = p_var_out->at(i); auto* var = scope.FindVar(vo); - std::shared_ptr to; if (var && var->IsType()) { + auto sp = m_results[i]->get_shape(); + var->GetMutable()->Resize(Shape2Ddim(sp)); auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var); - auto dd = tensor_pd->dims(); - ngraph::Shape sp = Ddim2Shape(dd); - auto ng_type = var_type_map_.at(vo); - if (ng_type == ngraph::element::f32) { - auto pd_arr = tensor_pd->mutable_data(place); - to = backend_->create_tensor(ng_type, sp, pd_arr); - } else if (ng_type == ngraph::element::i64) { - auto pd_arr = tensor_pd->mutable_data(place); - to = backend_->create_tensor(ng_type, sp, pd_arr); - } else if (ng_type == ngraph::element::i32) { - auto pd_arr = tensor_pd->mutable_data(place); - to = backend_->create_tensor(ng_type, sp, pd_arr); - } else if (ng_type == ngraph::element::f64) { - auto pd_arr = tensor_pd->mutable_data(place); - to = backend_->create_tensor(ng_type, sp, pd_arr); - } else if (ng_type == ngraph::element::boolean) { - auto pd_arr = tensor_pd->mutable_data(place); - to = backend_->create_tensor(ng_type, sp, pd_arr); - } else { - PADDLE_THROW("Data type not handled in for var %s", vo); - } - t_out.push_back(to); + auto ng_type = m_results[i]->get_element_type(); + void* pd_arr = tensor_pd->mutable_data(place, ng2pd_type_map[ng_type]); + std::shared_ptr to = + backend_->create_tensor(ng_type, sp, pd_arr); + t_out.emplace_back(to); } else { PADDLE_THROW("Cannot find var or tensor with var name %s", vo); } } - backend_->call(backend_->compile(ngraph_function_), t_out, t_in); + auto handle = backend_->compile(ng_func); + handle->call_with_validate(t_out, *p_t_in); } // NgraphEngine::Run } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/ngraph/ngraph_engine.h b/paddle/fluid/operators/ngraph/ngraph_engine.h index bf5ff2a743b0edb69163e674d36c56a02c0b4153..fef51464b5702e61d052f28050f6aefaecf0f615 100644 --- a/paddle/fluid/operators/ngraph/ngraph_engine.h +++ b/paddle/fluid/operators/ngraph/ngraph_engine.h @@ -12,12 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifndef PADDLE_FLUID_OPERATORS_NGRAPH_NGRAPH_ENGINE_H_ +#define PADDLE_FLUID_OPERATORS_NGRAPH_NGRAPH_ENGINE_H_ +#include +#include #include #include +#include #include #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/var_desc.h" #include "ngraph/ngraph.hpp" @@ -33,29 +39,47 @@ enum class OpState { /* nGraph support state on ops */ UNKNOWN /* Output all for debug purpose */ }; +// cache engine repetitives +struct EngineCache { + std::shared_ptr ngraph_function; + std::set persistables; + std::vector var_in; + std::vector var_out; + std::vector var_in_updates; + bool is_test = true; +}; + // perform graph build through bridge and execute computation class NgraphEngine { public: explicit NgraphEngine(const framework::Scope& scope, const platform::Place& place, - const std::string& serialized_graph, - const std::vector& interval); + const framework::ExecutionContext& ctx); void Run(const framework::Scope& scope, const platform::Place& place) const; - static void EnableNgraph(const framework::ProgramDesc& program); + static const framework::BlockDesc* p_bdesc; + static std::vector feed_vars, fetch_vars; + + static void FuseNgraphOps( + const framework::BlockDesc& prog, + std::vector>* ops); private: - static std::unordered_map> - func_cache_; + static std::unordered_map engine_cache; + static std::unordered_map< + std::string, std::vector>> + t_in_cache_; + static framework::Variable* pre_var_ptr; + const framework::Scope& scope_; const platform::Place& place_; std::vector> fused_ops_; std::unordered_map var_type_map_; - std::unordered_set persistables_; - std::unordered_set fetches_; + std::set persistables_; std::unordered_set post_op_inputs_; - OpState ng_op_state_ = OpState::UNKNOWN; + OpState op_state_ = OpState::UNKNOWN; + bool is_test_{true}; std::string func_cache_key_; // ngraph backend eg. CPU @@ -66,6 +90,8 @@ class NgraphEngine { std::vector var_in_; // var_name of outputs from fetch in order std::vector var_out_; + // non-persitable var_in + std::vector var_in_updates_; // map input vars to nodes std::shared_ptr< std::unordered_map>> @@ -74,20 +100,23 @@ class NgraphEngine { std::shared_ptr< std::unordered_map>> var_node_map_; - // prepare info for nraph engine - void Prepare(const framework::BlockDesc& block, - const std::vector& interval); + // prepare info for ngraph engine need + void Prepare(const std::vector& interval); + // get ngraph engine input and output list + void BuildNgIO(const std::vector& op_descs, + const std::vector& interval); // get ngraph input and define ngraph input parameters - void GetNgInputShape(std::shared_ptr op); + void GetNgInputShape(); // Call ngraph bridge to map ops void BuildNgNodes(); - // get the ngraph input and output var list - void BuildNgIO(); + // run paddle RuntimeInferShape to get the tensor shape + void RunInferShape(); // build ngraph function call - void BuildNgFunction(); + void BuildNgFunction(const std::vector& interval); // Check cache for ngraph function or otherwise build the function - void GetNgFunction(); + void GetNgFunction(std::string engine_key, const std::vector& interval); }; } // namespace operators } // namespace paddle +#endif // PADDLE_FLUID_OPERATORS_NGRAPH_NGRAPH_ENGINE_H_ diff --git a/paddle/fluid/operators/ngraph/ngraph_engine_op.cc b/paddle/fluid/operators/ngraph/ngraph_engine_op.cc index 3051ca123b29658d3e9a35239ad00f621a297cb5..f941f917c82b3b74a35739c08112233fd0a3477c 100644 --- a/paddle/fluid/operators/ngraph/ngraph_engine_op.cc +++ b/paddle/fluid/operators/ngraph/ngraph_engine_op.cc @@ -29,6 +29,7 @@ class NgraphEngineOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Xs", "A list of inputs.").AsDispensable(); AddOutput("Ys", "A list of outputs").AsDispensable(); AddAttr("graph", "the graph."); + AddAttr("engine_key", "the engine hash key."); AddAttr>("interval", "op interval supported by ngraph"); AddComment("ngraph engine operator."); } diff --git a/paddle/fluid/operators/ngraph/ngraph_engine_op.h b/paddle/fluid/operators/ngraph/ngraph_engine_op.h index 2f194a9b8766316fc645f7e22e21fff048fb7d63..c9b2a3970e17c1a06fa0cc67aa15df304a30656e 100644 --- a/paddle/fluid/operators/ngraph/ngraph_engine_op.h +++ b/paddle/fluid/operators/ngraph/ngraph_engine_op.h @@ -46,10 +46,8 @@ class NgraphEngineKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto& scope = ctx.scope(); auto place = ctx.GetPlace(); - std::string serialized_graph = ctx.Attr("graph"); - auto interval = ctx.Attr>("interval"); - NgraphEngine ngraph_engine(scope, place, serialized_graph, interval); + NgraphEngine ngraph_engine(scope, place, ctx); ngraph_engine.Run(scope, place); } }; diff --git a/paddle/fluid/operators/ngraph/ngraph_ops.h b/paddle/fluid/operators/ngraph/ngraph_ops.h deleted file mode 100644 index c7d7392080cdc82f1d59314337192ad8ea5fa2d1..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/ngraph/ngraph_ops.h +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -/* - * This file contains the list of the ngraph operators for Paddle. - * - * ATTENTION: It requires some C++11 features, for lower version C++ or C, we - * might release another API. - */ - -#pragma once - -#include "ops/accuracy_op.h" -#include "ops/activation_op.h" -#include "ops/batch_norm_op.h" -#include "ops/binary_unary_op.h" -#include "ops/conv2d_op.h" -#include "ops/elementwise_add_op.h" -#include "ops/fill_constant_op.h" -#include "ops/mean_op.h" -#include "ops/mul_op.h" -#include "ops/pool2d_op.h" -#include "ops/scale_op.h" -#include "ops/softmax_op.h" -#include "ops/sum_op.h" -#include "ops/top_k_op.h" diff --git a/paddle/fluid/operators/ngraph/ops/CMakeLists.txt b/paddle/fluid/operators/ngraph/ops/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..7dee3308b74a70a2daf35055d3ac80a14de99ac1 --- /dev/null +++ b/paddle/fluid/operators/ngraph/ops/CMakeLists.txt @@ -0,0 +1,8 @@ +file(GLOB LIST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.h") +set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/operators/ngraph/ngraph_ops.h) +file(APPEND ${pass_file} "\#pragma once\n") +file(WRITE ${pass_file} "// Generated by the /paddle/fluid/operators/ngraph/ops/CMakeLists.txt. DO NOT EDIT!\n\n") + +foreach(OPS_NAME ${LIST_OPS}) + file(APPEND ${pass_file} "\#include \"paddle/fluid/operators/ngraph/ops/${OPS_NAME}\"\n") +endforeach(OPS_NAME) diff --git a/paddle/fluid/operators/ngraph/ops/accuracy_op.h b/paddle/fluid/operators/ngraph/ops/accuracy_op.h index bf37ce48d8c2ce3b97fac154be9d1dfb08421f97..0da57517a733985ce1208732f13b08cd7bb8ca30 100644 --- a/paddle/fluid/operators/ngraph/ops/accuracy_op.h +++ b/paddle/fluid/operators/ngraph/ops/accuracy_op.h @@ -14,9 +14,12 @@ limitations under the License. */ #pragma once +#include #include +#include #include #include "ngraph/ngraph.hpp" +#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/platform/ngraph_helper.h" namespace paddle { @@ -63,3 +66,5 @@ void BuildAccuracyNode( } // namespace ngraphs } // namespace operators } // namespace paddle + +REGISTER_NG_OP(accuracy, BuildAccuracyNode); diff --git a/paddle/fluid/operators/ngraph/ops/activation_op.h b/paddle/fluid/operators/ngraph/ops/activation_op.h index f66080e3aabc05d3ce5ecaa3791de4410e34fa37..a66ec65a336f807f554157628888633db22ebfec 100644 --- a/paddle/fluid/operators/ngraph/ops/activation_op.h +++ b/paddle/fluid/operators/ngraph/ops/activation_op.h @@ -14,9 +14,12 @@ limitations under the License. */ #pragma once +#include #include +#include #include "ngraph/ngraph.hpp" +#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/platform/ngraph_helper.h" namespace paddle { @@ -50,3 +53,6 @@ void BuildTanhGradNode( } // namespace ngraphs } // namespace operators } // namespace paddle + +REGISTER_NG_OP(relu_grad, BuildReluGradNode); +REGISTER_NG_OP(tanh_grad, BuildTanhGradNode); diff --git a/paddle/fluid/operators/ngraph/ops/adam_op.h b/paddle/fluid/operators/ngraph/ops/adam_op.h new file mode 100644 index 0000000000000000000000000000000000000000..beba5d3d237d4dea578651f440b65a15251d5ad2 --- /dev/null +++ b/paddle/fluid/operators/ngraph/ops/adam_op.h @@ -0,0 +1,79 @@ +/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "ngraph/ngraph.hpp" +#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h" +#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" +#include "paddle/fluid/platform/ngraph_helper.h" + +namespace paddle { +namespace operators { +namespace ngraphs { + +void BuildAdamNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto op_attrs = framework::AttrReader(op->Attrs()); + auto beta1pow = platform::GetInputNode(op, "Beta1Pow", ngb_node_map); + auto beta2pow = platform::GetInputNode(op, "Beta2Pow", ngb_node_map); + auto grad = platform::GetInputNode(op, "Grad", ngb_node_map); + auto learning_rate = platform::GetInputNode(op, "LearningRate", ngb_node_map); + auto moment1 = platform::GetInputNode(op, "Moment1", ngb_node_map); + auto moment2 = platform::GetInputNode(op, "Moment2", ngb_node_map); + auto param = platform::GetInputNode(op, "Param", ngb_node_map); + + auto epsilon = op_attrs.Get("epsilon"); + auto beta2 = op_attrs.Get("beta2"); + auto beta1 = op_attrs.Get("beta1"); + + auto moment1_shape = moment1->get_shape(); + auto grad_shape = grad->get_shape(); + + auto moment1out = std::make_shared( + ElementwiseScalar(beta1, moment1), + ElementwiseScalar(1. - beta1, grad)); + + auto grad_square = std::make_shared(grad, grad); + auto moment2out = std::make_shared( + ElementwiseScalar(beta2, moment2), + ElementwiseScalar(1. - beta2, grad_square)); + auto node_sqrt = std::make_shared( + ElementwiseScalar(1., beta2pow)); + auto lr = std::make_shared( + node_sqrt, ElementwiseScalar(1., beta1pow)); + auto updated_lr = std::make_shared(learning_rate, lr); + + auto moment2_sqrt = std::make_shared(moment2out); + auto param_grad = std::make_shared( + moment1out, ElementwiseScalar(epsilon, moment2_sqrt)); + auto delta = ElementwiseScalar(updated_lr, param_grad); + auto param_out = std::make_shared(param, delta); + + platform::SetOutputNode(op, "Moment1Out", moment1out, ngb_node_map); + platform::SetOutputNode(op, "Moment2Out", moment2out, ngb_node_map); + platform::SetOutputNode(op, "ParamOut", param_out, ngb_node_map); +} +} // namespace ngraphs +} // namespace operators +} // namespace paddle + +REGISTER_NG_OP(adam, BuildAdamNode); diff --git a/paddle/fluid/operators/ngraph/ops/batch_norm_op.h b/paddle/fluid/operators/ngraph/ops/batch_norm_op.h index 2cdd0299760dadc228fb9121585363b23652789a..01fe78cdb24652429f713d09ea2abb8c73bbddf5 100644 --- a/paddle/fluid/operators/ngraph/ops/batch_norm_op.h +++ b/paddle/fluid/operators/ngraph/ops/batch_norm_op.h @@ -14,12 +14,15 @@ limitations under the License. */ #pragma once +#include #include +#include #include #include "ngraph/ngraph.hpp" #include "paddle/fluid/operators/ngraph/ops/elementwise_node.h" #include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h" +#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/platform/ngraph_helper.h" namespace paddle { @@ -44,6 +47,10 @@ void BuildBatchNormNode( const float epsilon = op_attrs.Get("epsilon"); const float momentum = op_attrs.Get("momentum"); + PADDLE_ENFORCE( + data_layout == "NHWC" || data_layout == "NCHW" || data_layout == "NC", + "The BatchNorm operator only supports NHWC/NCHW/NC data format"); + if (data_layout == "NHWC") { x = paddle::platform::Nhwc2Nchw(x); } @@ -110,6 +117,9 @@ void BuildBatchNormGradNode( "BN grap input size needs to be 2 or 4"); PADDLE_ENFORCE_EQ(x_shape.size(), dy_shape.size(), "BN grap input and delta size needs to be equal"); + PADDLE_ENFORCE( + data_layout == "NHWC" || data_layout == "NCHW" || data_layout == "NC", + "The BatchNorm operator only supports NHWC/NCHW/NC data format"); if (x_shape.size() == 2) { x = std::make_shared( @@ -148,3 +158,6 @@ void BuildBatchNormGradNode( } // namespace ngraphs } // namespace operators } // namespace paddle + +REGISTER_NG_OP(batch_norm, BuildBatchNormNode); +REGISTER_NG_OP(batch_norm_grad, BuildBatchNormGradNode); diff --git a/paddle/fluid/operators/ngraph/ops/binary_unary_op.h b/paddle/fluid/operators/ngraph/ops/binary_unary_op.h index 0c0d25d0cd1ae536618057ce80388b8eeb81c68a..2d11775849a778262dcd3e36ff35d8851fb350f1 100644 --- a/paddle/fluid/operators/ngraph/ops/binary_unary_op.h +++ b/paddle/fluid/operators/ngraph/ops/binary_unary_op.h @@ -14,8 +14,11 @@ limitations under the License. */ #pragma once +#include #include +#include #include "ngraph/ngraph.hpp" +#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/platform/ngraph_helper.h" namespace paddle { @@ -47,3 +50,7 @@ static void BuildUnaryNode( } // namespace ngraphs } // namespace operators } // namespace paddle + +REGISTER_NG_OP(relu, BuildUnaryNode); +REGISTER_NG_OP(tanh, BuildUnaryNode); +REGISTER_NG_OP(sigmoid, BuildUnaryNode); diff --git a/paddle/fluid/operators/ngraph/ops/concat_op.h b/paddle/fluid/operators/ngraph/ops/concat_op.h new file mode 100644 index 0000000000000000000000000000000000000000..27d796851501b9158e1ce7f6415b4d5373e88e2d --- /dev/null +++ b/paddle/fluid/operators/ngraph/ops/concat_op.h @@ -0,0 +1,50 @@ +/*Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include "ngraph/ngraph.hpp" +#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" +#include "paddle/fluid/platform/ngraph_helper.h" + +namespace paddle { +namespace operators { +namespace ngraphs { + +void BuildConcatNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + std::vector> args; + for (auto& var_name_item : op->Inputs()) { + for (auto& var_name : var_name_item.second) { + auto& node0 = ngb_node_map->at(var_name); + args.push_back(node0); + } + } + auto op_attrs = framework::AttrReader(op->Attrs()); + const size_t axis = op_attrs.Get("axis"); + auto out = std::make_shared(args, axis); + platform::SetOutputNode(op, "Out", out, ngb_node_map); +} +} // namespace ngraphs +} // namespace operators +} // namespace paddle + +REGISTER_NG_OP(concat, BuildConcatNode); diff --git a/paddle/fluid/operators/ngraph/ops/conv2d_op.h b/paddle/fluid/operators/ngraph/ops/conv2d_op.h index 46fb2703f51482afa0546f08b8fc7b2c98e281bc..be766ebeb4796be102c917296238b8ab14710131 100644 --- a/paddle/fluid/operators/ngraph/ops/conv2d_op.h +++ b/paddle/fluid/operators/ngraph/ops/conv2d_op.h @@ -14,9 +14,12 @@ limitations under the License. */ #pragma once +#include #include +#include #include #include "ngraph/ngraph.hpp" +#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/platform/ngraph_helper.h" namespace paddle { @@ -233,3 +236,6 @@ void BuildConv2dGradNode( } // namespace ngraphs } // namespace operators } // namespace paddle + +REGISTER_NG_OP(conv2d, BuildConv2dNode); +REGISTER_NG_OP(conv2d_grad, BuildConv2dGradNode); diff --git a/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h b/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h new file mode 100644 index 0000000000000000000000000000000000000000..be36b9d21ef6ebe5c11d783462e7dc564afe2aba --- /dev/null +++ b/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h @@ -0,0 +1,151 @@ +/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include + +#include "ngraph/ngraph.hpp" +#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" +#include "paddle/fluid/platform/ngraph_helper.h" + +namespace paddle { +namespace operators { +namespace ngraphs { + +void BuildCrossEntropyNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map); + auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map); + auto label_shape = label->get_shape(); + auto x_shape = x->get_shape(); + auto label_rank = label_shape.size(); + auto x_rank = x_shape.size(); + std::shared_ptr x_2d = x, label_2d = label; + auto label_2d_shape = label_shape, x_2d_shape = x_shape; + + if (label_rank > 2) { + label_2d_shape = paddle::platform::FlattenTo2d(label_shape, label_rank - 1); + label_2d = paddle::platform::NgReshaper(label, label_2d_shape); + } + if (x_rank > 2) { + x_2d_shape = paddle::platform::FlattenTo2d(x_shape, x_rank - 1); + x_2d = paddle::platform::NgReshaper(x, x_2d_shape); + } + + auto batch_size = x_2d_shape.at(0); + auto op_attrs = paddle::framework::AttrReader(op->Attrs()); + const bool is_soft_label = op_attrs.Get("soft_label"); + + std::shared_ptr node_1_hot = label_2d; + if (!is_soft_label) { + auto label_1d = paddle::platform::NgReshaper( + label_2d, ngraph::Shape{label_2d_shape.at(0)}); + node_1_hot = std::make_shared(label_1d, x_2d_shape, 1); + } + if (x->get_element_type() != node_1_hot->get_element_type()) { + node_1_hot = std::make_shared(node_1_hot, + x->get_element_type()); + } + + auto node_log = std::make_shared(x_2d); + auto high_clip = ngraph::op::Constant::create(node_log->get_element_type(), + node_log->get_shape(), {1e20}); + auto low_clip = ngraph::op::Constant::create(node_log->get_element_type(), + node_log->get_shape(), {-1e20}); + auto node_min = std::make_shared(node_log, high_clip); + auto node_max = std::make_shared(node_min, low_clip); + auto node_mul = node_1_hot * node_log; + auto node_sum = + std::make_shared(node_mul, ngraph::AxisSet{1}); + auto node_neg = std::make_shared(node_sum); + auto xe = + paddle::platform::NgReshaper(node_neg, ngraph::Shape{batch_size, 1}); + + if (!is_soft_label) { + auto ignore_index = op_attrs.Get("ignore_index"); + auto ignore_node = ngraph::op::Constant::create( + label->get_element_type(), label_2d_shape, {ignore_index}); + auto not_equal_node = + std::make_shared(label_2d, ignore_node); + auto mask = std::make_shared(not_equal_node, + xe->get_element_type()); + xe = xe * mask; + } + + paddle::platform::SetOutputNode(op, "Y", xe, ngb_node_map); +} + +void BuildCrossEntropyGradNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto op_attrs = paddle::framework::AttrReader(op->Attrs()); + const bool is_soft_label = op_attrs.Get("soft_label"); + + auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map); + auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map); + auto dy = paddle::platform::GetInputNode(op, "Y@GRAD", ngb_node_map); + auto x_shape = x->get_shape(); + auto rank = x_shape.size(); + + std::shared_ptr mask; + if (!is_soft_label) { + auto label_shape = label->get_shape(); + label_shape.pop_back(); + label = paddle::platform::NgReshaper(label, label_shape); + + auto ignore_index = op_attrs.Get("ignore_index"); + auto ignore_node = ngraph::op::Constant::create( + label->get_element_type(), label_shape, {ignore_index}); + auto not_equal_node = + std::make_shared(label, ignore_node); + mask = std::make_shared(not_equal_node, + x->get_element_type()); + mask = std::make_shared(mask, x_shape, + ngraph::AxisSet{rank - 1}); + + label = std::make_shared(label, x_shape, rank - 1); + } + + auto dy_shape = dy->get_shape(); + dy_shape.pop_back(); + auto dy_reshape = paddle::platform::NgReshaper(dy, dy_shape); + auto dy_bcast = std::make_shared( + dy_reshape, x_shape, ngraph::AxisSet{rank - 1}); + if (x->get_element_type() != label->get_element_type()) { + label = std::make_shared(label, x->get_element_type()); + } + + auto xe_grad = -label * dy_bcast / x; + + if (!is_soft_label) { + xe_grad = xe_grad * mask; + } + + paddle::platform::SetOutputNode(op, "X@GRAD", xe_grad, ngb_node_map); +} +} // namespace ngraphs +} // namespace operators +} // namespace paddle + +REGISTER_NG_OP(cross_entropy, BuildCrossEntropyNode); +REGISTER_NG_OP(cross_entropy_grad, BuildCrossEntropyGradNode); diff --git a/paddle/fluid/operators/ngraph/ops/elementwise_add_op.h b/paddle/fluid/operators/ngraph/ops/elementwise_add_op.h index 868df51e16a9714a750bac64dadc3441de79165e..d7485a706a193a52113cb993a3604c444b4303c0 100644 --- a/paddle/fluid/operators/ngraph/ops/elementwise_add_op.h +++ b/paddle/fluid/operators/ngraph/ops/elementwise_add_op.h @@ -14,11 +14,14 @@ limitations under the License. */ #pragma once +#include #include +#include #include #include "ngraph/ngraph.hpp" #include "paddle/fluid/operators/ngraph/ops/elementwise_node.h" +#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/platform/ngraph_helper.h" namespace paddle { @@ -85,3 +88,6 @@ void BuildElementwiseAddGradNode( } // namespace ngraphs } // namespace operators } // namespace paddle + +REGISTER_NG_OP(elementwise_add, BuildElementwiseAddNode); +REGISTER_NG_OP(elementwise_add_grad, BuildElementwiseAddGradNode); diff --git a/paddle/fluid/operators/ngraph/ops/fill_constant_op.h b/paddle/fluid/operators/ngraph/ops/fill_constant_op.h index 406a4314f89810df192280cc97de245553d5520f..42c2df5259242b7ae28613ab12c237834febc574 100644 --- a/paddle/fluid/operators/ngraph/ops/fill_constant_op.h +++ b/paddle/fluid/operators/ngraph/ops/fill_constant_op.h @@ -14,9 +14,12 @@ limitations under the License. */ #pragma once +#include #include +#include #include #include "ngraph/ngraph.hpp" +#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/platform/ngraph_helper.h" namespace paddle { @@ -46,8 +49,6 @@ void BuildFillConstantNode( ng_dtype = ngraph::element::i64; } else if (data_type == paddle::framework::proto::VarType::INT32) { ng_dtype = ngraph::element::i32; - } else if (data_type == paddle::framework::proto::VarType::BOOL) { - ng_dtype = ngraph::element::boolean; } else { PADDLE_THROW("unsupported data type: %s", data_type); } @@ -57,3 +58,5 @@ void BuildFillConstantNode( } // namespace ngraphs } // namespace operators } // namespace paddle + +REGISTER_NG_OP(fill_constant, BuildFillConstantNode); diff --git a/paddle/fluid/operators/ngraph/ops/mean_op.h b/paddle/fluid/operators/ngraph/ops/mean_op.h index 4c44bc4c112f401c2707f7babd49a33f238a768f..86e697d260eb0f26428258b5faea958a7319948c 100644 --- a/paddle/fluid/operators/ngraph/ops/mean_op.h +++ b/paddle/fluid/operators/ngraph/ops/mean_op.h @@ -15,10 +15,13 @@ limitations under the License. */ #pragma once #include +#include #include +#include #include "ngraph/ngraph.hpp" #include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h" +#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/platform/ngraph_helper.h" namespace paddle { @@ -64,3 +67,6 @@ void BuildMeanGradNode( } // namespace ngraphs } // namespace operators } // namespace paddle + +REGISTER_NG_OP(mean, BuildMeanNode); +REGISTER_NG_OP(mean_grad, BuildMeanGradNode); diff --git a/paddle/fluid/operators/ngraph/ops/momentum_op.h b/paddle/fluid/operators/ngraph/ops/momentum_op.h new file mode 100644 index 0000000000000000000000000000000000000000..84bddacba89d2921bca4915af7f64dcfbfdd42db --- /dev/null +++ b/paddle/fluid/operators/ngraph/ops/momentum_op.h @@ -0,0 +1,106 @@ +/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include "ngraph/ngraph.hpp" +#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" +#include "paddle/fluid/platform/ngraph_helper.h" + +namespace paddle { +namespace operators { +namespace ngraphs { + +void BuildMomentumNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto op_attrs = paddle::framework::AttrReader(op->Attrs()); + auto param = paddle::platform::GetInputNode(op, "Param", ngb_node_map); + auto grad = paddle::platform::GetInputNode(op, "Grad", ngb_node_map); + auto velocity = paddle::platform::GetInputNode(op, "Velocity", ngb_node_map); + auto learning_rate = + paddle::platform::GetInputNode(op, "LearningRate", ngb_node_map); + + auto mu = op_attrs.Get("mu"); + bool use_nesterov = op_attrs.Get("use_nesterov"); + + auto param_shape = param->get_shape(); + auto velocity_shape = velocity->get_shape(); + auto grad_shape = grad->get_shape(); + auto lr_shape = learning_rate->get_shape(); + + auto shape_velocity = ngraph::Shape{velocity_shape}; + auto mu_create = + ngraph::op::Constant::create(ngraph::element::f32, shape_velocity, {mu}); + + auto vel_mul = std::make_shared(velocity, mu_create); + auto vel_out = std::make_shared(vel_mul, grad); + + ngraph::NodeVector result; + if (use_nesterov) { + auto mul_res = std::make_shared(vel_out, mu_create); + auto add_res = std::make_shared(grad, mul_res); + + auto add_2d = paddle::platform::FlattenTo2d(add_res->get_shape(), 0); + auto vel_reshape = paddle::platform::NgReshaper(vel_out, add_2d); + + auto lr_bcast = std::make_shared( + learning_rate, vel_reshape->get_shape(), + ngraph::AxisSet{vel_reshape->get_shape().size() - 1}); + + auto lr_1d = paddle::platform::FlattenTo1d(lr_bcast->get_shape(), 0); + auto lr_reshape = std::make_shared( + lr_bcast, ngraph::AxisVector{0, 1}, lr_1d); + + lr_reshape = std::make_shared( + lr_reshape, ngraph::AxisVector{0}, param->get_shape()); + + auto mul_res1 = std::make_shared(add_res, lr_reshape); + auto res = std::make_shared(param, mul_res1); + paddle::platform::SetOutputNode(op, "ParamOut", res, ngb_node_map); + } else { + auto vel_2d = paddle::platform::FlattenTo2d(vel_out->get_shape(), 0); + auto vel_reshape = paddle::platform::NgReshaper(vel_out, vel_2d); + + auto lr_bcast = std::make_shared( + learning_rate, vel_reshape->get_shape(), + ngraph::AxisSet{vel_reshape->get_shape().size() - 1}); + + auto lr_1d = paddle::platform::FlattenTo1d(lr_bcast->get_shape(), 0); + auto lr_reshape = std::make_shared( + lr_bcast, ngraph::AxisVector{0, 1}, lr_1d); + + lr_reshape = std::make_shared( + lr_reshape, ngraph::AxisVector{0}, param->get_shape()); + + auto mul_result = + std::make_shared(lr_reshape, vel_out); + + auto res = std::make_shared(param, mul_result); + paddle::platform::SetOutputNode(op, "ParamOut", res, ngb_node_map); + } + paddle::platform::SetOutputNode(op, "VelocityOut", vel_out, ngb_node_map); +} + +} // namespace ngraphs +} // namespace operators +} // namespace paddle + +REGISTER_NG_OP(momentum, BuildMomentumNode); diff --git a/paddle/fluid/operators/ngraph/ops/mul_op.h b/paddle/fluid/operators/ngraph/ops/mul_op.h index 4a6cbebe245f891c6c33b2116330a41d89d50e25..d13665864b8950436298b7cf685c803593007803 100644 --- a/paddle/fluid/operators/ngraph/ops/mul_op.h +++ b/paddle/fluid/operators/ngraph/ops/mul_op.h @@ -14,8 +14,11 @@ limitations under the License. */ #pragma once +#include #include +#include #include "ngraph/ngraph.hpp" +#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/platform/ngraph_helper.h" namespace paddle { @@ -130,3 +133,6 @@ static void BuildMulGradNode( } // namespace ngraphs } // namespace operators } // namespace paddle + +REGISTER_NG_OP(mul, BuildMulNode); +REGISTER_NG_OP(mul_grad, BuildMulGradNode); diff --git a/paddle/fluid/operators/ngraph/ops/op_bridge.h b/paddle/fluid/operators/ngraph/ops/op_bridge.h new file mode 100644 index 0000000000000000000000000000000000000000..93df0ad8062745380d9cd4ca5027bef1425083bf --- /dev/null +++ b/paddle/fluid/operators/ngraph/ops/op_bridge.h @@ -0,0 +1,84 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once +#include +#include +#include +#include + +#include "ngraph/node.hpp" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/ngraph/ngraph_bridge.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace ops { + +class NgraphSingleton { + NgraphSingleton() = default; + NgraphSingleton(NgraphSingleton const&) = delete; + void operator=(NgraphSingleton const) = delete; + + ~NgraphSingleton() = default; + + static std::map< + std::string, + std::function&, + std::shared_ptr>>)>> + ng_node_maps_; + + public: + template + static void Register(TF&& tf, const std::string& name) { + ng_node_maps_[name] = tf; + } + + static bool Lookup(const std::string& name) { + auto it = ng_node_maps_.find(name); + if (it == ng_node_maps_.end()) { + return true; + } + return false; + } + + static void BuildNode( + const std::shared_ptr>>& ng_maps, + const std::shared_ptr& op, + const std::string& name) { + ng_node_maps_[name](op, ng_maps); + } +}; + +std::map&, + std::shared_ptr>>)>> + NgraphSingleton::ng_node_maps_; + +} // namespace ops +} // namespace operators +} // namespace paddle + +#define REGISTER_NG_OP(op_type__, Converter__) \ + struct ng_##op_type__##_converter { \ + ng_##op_type__##_converter() { \ + paddle::operators::ops::NgraphSingleton::Register( \ + paddle::operators::ngraphs::Converter__, #op_type__); \ + } \ + }; \ + ng_##op_type__##_converter ng_##op_type__##_converter__; diff --git a/paddle/fluid/operators/ngraph/ops/pool2d_op.h b/paddle/fluid/operators/ngraph/ops/pool2d_op.h index 836c9d6c185b305d3dd4c9e9d30e23abb0c1431c..c7b9c9316171a448d16ed68339f5754d25f3cabd 100644 --- a/paddle/fluid/operators/ngraph/ops/pool2d_op.h +++ b/paddle/fluid/operators/ngraph/ops/pool2d_op.h @@ -14,10 +14,13 @@ limitations under the License. */ #pragma once +#include #include +#include #include #include "ngraph/ngraph.hpp" +#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/platform/ngraph_helper.h" namespace paddle { @@ -172,3 +175,6 @@ void BuildPool2dGradNode( } // namespace ngraphs } // namespace operators } // namespace paddle + +REGISTER_NG_OP(pool2d, BuildPool2dNode); +REGISTER_NG_OP(pool2d_grad, BuildPool2dGradNode); diff --git a/paddle/fluid/operators/ngraph/ops/scale_op.h b/paddle/fluid/operators/ngraph/ops/scale_op.h index 91a57d0be606373e985a30b7ac9c73648062d8e4..1461b85b16ece79548f3ca95be811fb31136c610 100644 --- a/paddle/fluid/operators/ngraph/ops/scale_op.h +++ b/paddle/fluid/operators/ngraph/ops/scale_op.h @@ -14,9 +14,12 @@ limitations under the License. */ #pragma once +#include #include +#include #include "ngraph/ngraph.hpp" #include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h" +#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/platform/ngraph_helper.h" namespace paddle { @@ -37,3 +40,5 @@ void BuildScaleNode( } // namespace ngraphs } // namespace operators } // namespace paddle + +REGISTER_NG_OP(scale, BuildScaleNode); diff --git a/paddle/fluid/operators/ngraph/ops/softmax_op.h b/paddle/fluid/operators/ngraph/ops/softmax_op.h index fc6395c08bc6b00990679c5327c3152a980be821..7d5720c460c4194ce06670a715b8d7ff4435bb2a 100644 --- a/paddle/fluid/operators/ngraph/ops/softmax_op.h +++ b/paddle/fluid/operators/ngraph/ops/softmax_op.h @@ -14,10 +14,13 @@ limitations under the License. */ #pragma once +#include #include +#include #include #include "ngraph/ngraph.hpp" #include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h" +#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/platform/ngraph_helper.h" namespace paddle { @@ -72,3 +75,6 @@ void BuildSoftmaxGradNode( } // namespace ngraphs } // namespace operators } // namespace paddle + +REGISTER_NG_OP(softmax, BuildSoftmaxNode); +REGISTER_NG_OP(softmax_grad, BuildSoftmaxGradNode); diff --git a/paddle/fluid/operators/ngraph/ops/sum_op.h b/paddle/fluid/operators/ngraph/ops/sum_op.h index 97f4ce64aa58bfa8cb70c36f9a12b7b8135da637..ab8cdb8f4d847c0acb60b39d07dc83f085b60bbd 100644 --- a/paddle/fluid/operators/ngraph/ops/sum_op.h +++ b/paddle/fluid/operators/ngraph/ops/sum_op.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include "ngraph/ngraph.hpp" +#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/platform/ngraph_helper.h" namespace paddle { @@ -53,3 +54,5 @@ void BuildSumNode( } // namespace ngraphs } // namespace operators } // namespace paddle + +REGISTER_NG_OP(sum, BuildSumNode); diff --git a/paddle/fluid/operators/ngraph/ops/top_k_op.h b/paddle/fluid/operators/ngraph/ops/top_k_op.h index 852ecd7139a3c7046e78265ca021b2ce286c63c0..cdc26f6afd58700c3a1f57fa955d60bc8925d2d1 100644 --- a/paddle/fluid/operators/ngraph/ops/top_k_op.h +++ b/paddle/fluid/operators/ngraph/ops/top_k_op.h @@ -14,8 +14,11 @@ limitations under the License. */ #pragma once +#include #include +#include #include "ngraph/ngraph.hpp" +#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/platform/ngraph_helper.h" namespace paddle { @@ -42,3 +45,5 @@ void BuildTopKNode( } // namespace ngraphs } // namespace operators } // namespace paddle + +REGISTER_NG_OP(top_k, BuildTopKNode); diff --git a/paddle/fluid/operators/optimizers/sgd_op.h b/paddle/fluid/operators/optimizers/sgd_op.h index 98bae5e1d329005f9463fd7bb0751c44952dea88..5dd5f67e004c63e294152239ab7bd3db26542eed 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.h +++ b/paddle/fluid/operators/optimizers/sgd_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/jit/kernels.h" namespace paddle { namespace operators { @@ -32,53 +33,59 @@ class SGDOpKernel : public framework::OpKernel { if (param_var->IsType()) { const auto *param = ctx.Input("Param"); auto *param_out = ctx.Output("ParamOut"); - // Actually, all tensors are LoDTensor except SelectedRows. if (grad_var->IsType()) { - param_out->mutable_data(ctx.GetPlace()); const auto *grad = ctx.Input("Grad"); - - auto p = framework::EigenVector::Flatten(*param); - auto g = framework::EigenVector::Flatten(*grad); - auto o = framework::EigenVector::Flatten(*param_out); - auto *lr = learning_rate->data(); - - o = p - lr[0] * g; + auto sz = param_out->numel(); + PADDLE_ENFORCE_EQ(param->numel(), sz); + PADDLE_ENFORCE_EQ(grad->numel(), sz); + + jit::sgd_attr_t attr(1, sz, 1, sz, 1); + const T *lr = learning_rate->data(); + const T *param_data = param->data(); + const T *grad_data = grad->data(); + int64_t rows_idx = 0; + T *out_data = param_out->mutable_data(ctx.GetPlace()); + + auto sgd = + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + attr); + sgd(lr, param_data, grad_data, &rows_idx, out_data, &attr); } else if (grad_var->IsType()) { // TODO(qijun): In Sparse SGD operator, in-place update is enforced. // This manual optimization brings difficulty to track data dependency. // It's better to find a more elegant solution. PADDLE_ENFORCE_EQ(param, param_out); const auto *grad = ctx.Input("Grad"); + auto &grad_rows = grad->rows(); // for distributed training, a sparse var may be empty, // just skip updating. - if (grad->rows().size() == 0) { + if (grad_rows.size() == 0) { return; } - auto grad_height = grad->height(); auto out_dims = param_out->dims(); - PADDLE_ENFORCE_EQ(grad_height, out_dims[0]); - + PADDLE_ENFORCE_EQ(grad->height(), out_dims[0]); auto &grad_value = grad->value(); - auto &grad_rows = grad->rows(); - - size_t grad_row_numel = grad_value.numel() / grad_rows.size(); - PADDLE_ENFORCE_EQ(static_cast(grad_row_numel), - param_out->numel() / grad_height); - - auto *grad_data = grad_value.data(); - auto *out_data = param_out->data(); - auto *lr = learning_rate->data(); - for (size_t i = 0; i < grad_rows.size(); i++) { - PADDLE_ENFORCE(grad_rows[i] < grad_height, - "Input rows index should less than height"); - for (size_t j = 0; j < grad_row_numel; j++) { - out_data[grad_rows[i] * grad_row_numel + j] -= - lr[0] * grad_data[i * grad_row_numel + j]; - } - } + const T *param_data = param->data(); + const T *grad_data = grad_value.data(); + const T *lr = learning_rate->data(); + const int64_t *rows_data = grad_rows.data(); + T *out_data = param_out->mutable_data(ctx.GetPlace()); + + jit::sgd_attr_t attr; + attr.param_height = out_dims[0]; + attr.param_width = param_out->numel() / attr.param_height; + attr.grad_height = grad_rows.size(); // note: it is not grad->height() + attr.grad_width = grad_value.numel() / attr.grad_height; + attr.selected_rows_size = grad_rows.size(); + PADDLE_ENFORCE_EQ(attr.grad_width, attr.param_width); + + auto sgd = + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + attr); + sgd(lr, param_data, grad_data, rows_data, out_data, &attr); } else { PADDLE_THROW("Unsupported Variable Type of Grad"); } diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index fc3636e0b24765f681d3260b07fe854309774a40..0a0ece162cc63696974383d8ed49fdd10204c331 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -168,9 +168,10 @@ void Pool2dOpMaker::Make() { "be ignored."); // TODO(Chengduo): Add checker. // (Currently, // TypedAttrChecker don't support vector type.) - AddAttr("global_pooling", - "(bool, default false) Whether to use the global pooling. " - "If global_pooling = true, ksize and paddings will be ignored.") + AddAttr( + "global_pooling", + "(bool, default false) Whether to use the global pooling. " + "If global_pooling = true, kernel size and paddings will be ignored.") .SetDefault(false); AddAttr>("strides", "(vector, default {1, 1}), strides(height, " @@ -182,7 +183,7 @@ void Pool2dOpMaker::Make() { "paddings", "(vector, default {0,0}), paddings(height, width) of pooling " "operator." - "If global_pooling = true, paddings and ksize will be ignored.") + "If global_pooling = true, paddings and kernel size will be ignored.") .SetDefault({0, 0}); AddAttr( "exclusive", @@ -204,7 +205,7 @@ void Pool2dOpMaker::Make() { .SetDefault(false); AddAttr( "ceil_mode", - "(bool, default false) Wether to use the ceil function to calculate " + "(bool, default false) Whether to use the ceil function to calculate " "output height and width. False is the default. If it is set to False, " "the floor function will be used.") .SetDefault(false); @@ -262,28 +263,37 @@ Example: For exclusive = false: $$ hstart = i * strides[0] - paddings[0] + $$ + $$ hend = hstart + ksize[0] + $$ + $$ wstart = j * strides[1] - paddings[1] + $$ + $$ wend = wstart + ksize[1] + $$ + $$ Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{ksize[0] * ksize[1]} $$ + For exclusive = true: $$ hstart = max(0, i * strides[0] - paddings[0]) + $$ + $$ hend = min(H, hstart + ksize[0]) + $$ + $$ wstart = max(0, j * strides[1] - paddings[1]) + $$ + $$ wend = min(W, wstart + ksize[1]) + $$ + $$ Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)} $$ - For adaptive = true: - $$ - hstart = floor(i * H_{in} / H_{out}) - hend = ceil((i + 1) * H_{in} / H_{out}) - wstart = floor(j * W_{in} / W_{out}) - wend = ceil((j + 1) * W_{in} / W_{out}) - Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)} - $$ )DOC"); } @@ -324,7 +334,7 @@ void Pool3dOpMaker::Make() { AddAttr( "global_pooling", "(bool, default false) Whether to use the global pooling. " - "If global_pooling = true, ksize and paddings wille be ignored.") + "If global_pooling = true, kernel size and paddings will be ignored.") .SetDefault(false); AddAttr>( "strides", @@ -359,7 +369,7 @@ void Pool3dOpMaker::Make() { .SetDefault(false); AddAttr( "ceil_mode", - "(bool, default false) Wether to use the ceil function to calculate " + "(bool, default false) Whether to use the ceil function to calculate " "output height and width. False is the default. If it is set to False, " "the floor function will be used.") .SetDefault(false); @@ -392,48 +402,68 @@ Example: Output: Out shape: $(N, C, D_{out}, H_{out}, W_{out})$ For ceil_mode = false: - $$ - D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ - H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\ - W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1 - $$ + $$ + D_{out} = \\frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 + $$ + $$ + H_{out} = \\frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[2]} + 1 + $$ + $$ + W_{out} = \\frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1 + $$ For ceil_mode = true: - $$ - D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0] + strides[0] -1)}{strides[0]} + 1 \\ - H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1 \\ - W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1 - $$ + $$ + D_{out} = \\frac{(D_{in} - ksize[0] + 2 * paddings[0] + strides[0] -1)}{strides[0]} + 1 + $$ + $$ + H_{out} = \\frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1 + $$ + $$ + W_{out} = \\frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1 + $$ + For exclusive = false: - $$ - dstart = i * strides[0] - paddings[0] - dend = dstart + ksize[0] - hstart = j * strides[1] - paddings[1] - hend = hstart + ksize[1] - wstart = k * strides[2] - paddings[2] - wend = wstart + ksize[2] - Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{ksize[0] * ksize[1] * ksize[2]} - $$ + $$ + dstart = i * strides[0] - paddings[0] + $$ + $$ + dend = dstart + ksize[0] + $$ + $$ + hstart = j * strides[1] - paddings[1] + $$ + $$ + hend = hstart + ksize[1] + $$ + $$ + wstart = k * strides[2] - paddings[2] + $$ + $$ + wend = wstart + ksize[2] + $$ + $$ + Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{ksize[0] * ksize[1] * ksize[2]} + $$ + For exclusive = true: - $$ - dstart = max(0, i * strides[0] - paddings[0]) - dend = min(D, dstart + ksize[0]) - hstart = max(0, j * strides[1] - paddings[1]) - hend = min(H, hstart + ksize[1]) - wstart = max(0, k * strides[2] - paddings[2]) - wend = min(W, wstart + ksize[2]) - Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)} - $$ - - For adaptive = true: - $$ - dstart = floor(i * D_{in} / D_{out}) - dend = ceil((i + 1) * D_{in} / D_{out}) - hstart = floor(j * H_{in} / H_{out}) - hend = ceil((j + 1) * H_{in} / H_{out}) - wstart = floor(k * W_{in} / W_{out}) - wend = ceil((k + 1) * W_{in} / W_{out}) - Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)} - $$ + $$ + dstart = max(0, i * strides[0] - paddings[0]) + $$ + $$ + dend = min(D, dstart + ksize[0]) + $$ + $$ + hend = min(H, hstart + ksize[1]) + $$ + $$ + wstart = max(0, k * strides[2] - paddings[2]) + $$ + $$ + wend = min(W, wstart + ksize[2]) + $$ + $$ + Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)} + $$ )DOC"); } diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index defc29b91f81cb851fec24c5cd9d62dc72c54147..134807092d59329ce93381da67a98b8230db5767 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -13,9 +13,11 @@ // limitations under the License. #include "paddle/fluid/operators/reader/buffered_reader.h" +#include #include #include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { namespace reader { @@ -49,8 +51,9 @@ BufferedReader::BufferedReader( .Get(place_))) ->stream(); events.resize(buffer_size); - for (auto &event : events) + for (auto &event : events) { PADDLE_ENFORCE(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); + } PADDLE_ENFORCE(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); } #endif @@ -83,12 +86,15 @@ void BufferedReader::ReadAsync(size_t i) { #ifdef PADDLE_WITH_CUDA // NOTE(liangdun): using async copy instead of TensorCopySync - // TensorCopySync would block other stream + // TensorCopySync would block other stream, because TensorCopySync + // issues the copying command to the default stream, it will make two + // commands from different streams cannot run concurrently. if (platform::is_gpu_place(place_)) { platform::SetDeviceId(boost::get(place_).device); PADDLE_ENFORCE(cudaStreamWaitEvent(stream, events[i], 0)); TensorVec &gpu = gpu_buffer_[i]; gpu.resize(cpu.size()); + platform::RecordEvent record_event("BufferedReader:MemoryCopy"); for (size_t i = 0; i < cpu.size(); ++i) { gpu[i].Resize(cpu[i].dims()); gpu[i].set_layout(cpu[i].layout()); @@ -97,20 +103,20 @@ void BufferedReader::ReadAsync(size_t i) { auto gpu_ptr = gpu[i].mutable_data(place_, cpu[i].type()); auto size = cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type()); - if (platform::is_cuda_pinned_place(cpu_place)) + if (platform::is_cuda_pinned_place(cpu_place)) { memory::Copy(boost::get(place_), gpu_ptr, boost::get(cpu_place), cpu_ptr, size, stream); - else if ((platform::is_gpu_place(cpu_place))) + } else if ((platform::is_gpu_place(cpu_place))) { memory::Copy(boost::get(place_), gpu_ptr, boost::get(cpu_place), cpu_ptr, size, stream); - else - // if cpu place is not pinned, async copy is slower than sync copy, - // so we use sync copy instead. + } else { + // TODO(zcd): The default stream should not be used here. memory::Copy(boost::get(place_), gpu_ptr, boost::get(cpu_place), cpu_ptr, size, 0); + } gpu[i].set_lod(cpu[i].lod()); } PADDLE_ENFORCE(cudaStreamSynchronize(stream)); diff --git a/paddle/fluid/operators/reader/read_op.cc b/paddle/fluid/operators/reader/read_op.cc index 8fe638ac2fdc6e0baed7d6cd3c57b72f23164129..846b2ed77e46d82fbeda8faaeed99cddf23c8824 100644 --- a/paddle/fluid/operators/reader/read_op.cc +++ b/paddle/fluid/operators/reader/read_op.cc @@ -85,9 +85,7 @@ class ReadOp : public framework::OperatorBase { std::vector ins; // For profiling - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(dev_place); - platform::RecordEvent record_event(Type(), &ctx); + platform::RecordEvent record_event(Type()); reader->ReadNext(&ins); if (ins.empty()) { diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc index a1e02a3fd0e7902e89890f8d3b13159172571f5c..2898a62ddbac524ceb212cac5f34aeda3b1e01cb 100644 --- a/paddle/fluid/operators/recurrent_op.cc +++ b/paddle/fluid/operators/recurrent_op.cc @@ -157,11 +157,13 @@ class RecurrentBase : public framework::OperatorBase { const std::vector &src_vars, framework::Scope *dst_scope, const std::vector &dst_vars, - Callback callback) { + Callback callback, + bool is_backward = false) { PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size()); for (size_t i = 0; i < dst_vars.size(); ++i) { VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i]; - AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback); + AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback, + is_backward); } } @@ -173,11 +175,13 @@ class RecurrentBase : public framework::OperatorBase { const std::vector &src_vars, const framework::Scope &dst_scope, const std::vector &dst_vars, - Callback callback) { + Callback callback, + bool is_backward = false) { PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size()); for (size_t i = 0; i < dst_vars.size(); ++i) { VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i]; - AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback); + AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback, + is_backward); } } @@ -194,9 +198,13 @@ class RecurrentBase : public framework::OperatorBase { static void AccessTensor(const framework::Scope &src_scope, const std::string &src_var_name, framework::Scope *dst_scope, - const std::string &dst_var_name, Callback callback) { + const std::string &dst_var_name, Callback callback, + bool is_backward = false) { auto *src_var = src_scope.FindVar(src_var_name); - PADDLE_ENFORCE(src_var != nullptr); + if (is_backward && src_var == nullptr) { + return; + } + PADDLE_ENFORCE(src_var != nullptr, "%s is not found.", src_var_name); auto &src_tensor = src_var->Get(); auto *dst_var = dst_scope->Var(dst_var_name); @@ -208,12 +216,16 @@ class RecurrentBase : public framework::OperatorBase { static void AccessTensor(const framework::Scope &src_scope, const std::string &src_var_name, const framework::Scope &dst_scope, - const std::string &dst_var_name, Callback callback) { + const std::string &dst_var_name, Callback callback, + bool is_backward = false) { + auto *dst_var = dst_scope.FindVar(dst_var_name); + if (is_backward && dst_var == nullptr) { + return; + } auto *src_var = src_scope.FindVar(src_var_name); - PADDLE_ENFORCE(src_var != nullptr); + PADDLE_ENFORCE(src_var != nullptr, "%s is not found.", src_var_name); auto &src_tensor = src_var->Get(); - auto *dst_var = dst_scope.FindVar(dst_var_name); - PADDLE_ENFORCE(dst_var != nullptr); + PADDLE_ENFORCE(dst_var != nullptr, "%s is not found.", dst_var_name); auto *dst_tensor = dst_var->GetMutable(); callback(src_tensor, dst_tensor); } @@ -270,7 +282,9 @@ class RecurrentOp : public RecurrentBase { // Every inputs are linked now, execute! executor.Run(*program, &cur_scope, block->ID(), - false /*create_local_scope*/); + false /*create_local_scope*/, true /*create_vars*/, + std::vector() /*skip_ref_cnt_vars*/, + true /*force_disable_gc*/); // get device context from pool platform::DeviceContextPool &pool = @@ -345,7 +359,8 @@ class RecurrentGradOp : public RecurrentBase { auto dims = framework::vectorize(inside->dims()); dims.erase(dims.begin()); inside->Resize(framework::make_ddim(dims)); - }); + }, + true /*is_backward*/); auto og_set = List2Set(Inputs(kOutputGrads)); if (VLOG_IS_ON(10)) { @@ -385,7 +400,9 @@ class RecurrentGradOp : public RecurrentBase { VLOG(5) << "Recurrent memory linking finished "; // Run step block with cur_scope executor.Run(*program, &cur_scope, block->ID(), - false /*create_local_scope*/); + false /*create_local_scope*/, true /*create_vars*/, + std::vector() /*skip_ref_cnt_vars*/, + true /*force_disable_gc*/); VLOG(5) << "executor.Run finished "; @@ -454,7 +471,8 @@ class RecurrentGradOp : public RecurrentBase { auto dst = outside->Slice(seq_offset, seq_offset + 1); framework::TensorCopy(inside, place, dev_ctx, &dst); - }); + }, + true /*is_backward*/); VLOG(5) << "Link outside gradient finished "; if (step_id + 1 == seq_len) { // at_end @@ -467,7 +485,8 @@ class RecurrentGradOp : public RecurrentBase { outside->Resize(inside.dims()); outside->mutable_data(place, inside.type()); framework::TensorCopy(inside, place, dev_ctx, outside); - }); + }, + true /*is_backward*/); VLOG(5) << "Link initialize state gradient finished "; } scopes.Next(); @@ -608,10 +627,8 @@ class RecurrentGradOpShapeInference : public framework::InferShapeBase { std::vector input{kInputs, kInitialStates}; std::vector output{kOutputs}; for (auto &s : input) { + // NOTE(zcd): In some case, some of kInputs doesn't have gradient. PADDLE_ENFORCE(ctx->HasInputs(s)); - PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(s)), - "Cannot find the gradient variable %s", - framework::GradVarName(s)); } for (auto &s : output) { PADDLE_ENFORCE(ctx->HasInputs(s)); diff --git a/paddle/fluid/operators/requantize_op.cc b/paddle/fluid/operators/requantize_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..08ba1470aaddf146fe3685ff6c3cd9f3d7e16d75 --- /dev/null +++ b/paddle/fluid/operators/requantize_op.cc @@ -0,0 +1,46 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/fluid/operators/requantize_op.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + +namespace paddle { +namespace operators { + +framework::OpKernelType ReQuantOp::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + framework::LibraryType library_ = framework::LibraryType::kMKLDNN; + framework::DataLayout layout_ = framework::DataLayout::kMKLDNN; + + return framework::OpKernelType(ctx.Input("Input")->type(), + ctx.GetPlace(), layout_, library_); +} + +void ReQuantOpMaker::Make() { + AddInput("Input", "input data"); + AddOutput("Output", "output data"); + AddAttr("Scale_in", "scale in data").SetDefault({1.0f}); + AddAttr("Scale_out", "scale out data").SetDefault({1.0f}); + AddComment( + R"DOC(This op will re-quantize data from INT8 with scale_in to INT8 with scale_out)DOC"); +} + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OPERATOR(requantize, ops::ReQuantOp, ops::ReQuantOpMaker, + paddle::framework::DefaultGradOpDescMaker); diff --git a/paddle/fluid/operators/requantize_op.h b/paddle/fluid/operators/requantize_op.h new file mode 100644 index 0000000000000000000000000000000000000000..c2b154db11dc713fdce1b9ef2f2616428bc09202 --- /dev/null +++ b/paddle/fluid/operators/requantize_op.h @@ -0,0 +1,47 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using framework::OpKernelType; +using framework::Tensor; + +class ReQuantOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + ctx->SetOutputDim("Output", ctx->GetInputDim("Input")); + ctx->ShareLoD("Input", /*->*/ "Output"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class ReQuantOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index eda54f76b898cdf893347d31cadb86dea892a4ce..37f69426b62fedf8cbeca68105fb86fb4ea72eab 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -56,6 +56,9 @@ class ReshapeOp : public framework::OperatorWithKernel { static framework::DDim ValidateShape(const std::vector shape, const framework::DDim &in_dims) { const int64_t in_size = framework::product(in_dims); + auto in_dims_vec = framework::vectorize(in_dims); + bool all_positive = std::all_of(in_dims_vec.cbegin(), in_dims_vec.cend(), + [](int64_t i) { return i > 0; }); // only one dimension can be set to -1, whose size will be automatically // infered. const int64_t unk_dim_val = -1; @@ -88,7 +91,7 @@ class ReshapeOp : public framework::OperatorWithKernel { } if (unk_dim_idx != -1) { - if (in_size > 0) { + if (all_positive) { // in_size < 0 and is un-determinate in compile time, skip the check, // for example, in_dims = [-1, 8, 1, 1], shape = [-1, 3, 8], // capacity = -24, in_size = -8, output_shape[0] = 0 diff --git a/paddle/fluid/operators/sample_logits_op.cc b/paddle/fluid/operators/sample_logits_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a7f7fb26b17c77e6fe87646d3cac20c02c49b52c --- /dev/null +++ b/paddle/fluid/operators/sample_logits_op.cc @@ -0,0 +1,225 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sample_logits_op.h" +#include "paddle/fluid/operators/math/sample_prob.h" + +namespace paddle { +namespace operators { + +class SampleLogitsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Logits", + "(Tensor, default: Tensor), The unscaled log probabilities " + "which is a 2-D tensor with shape [N x K]. N is the batch_size, " + "and K is the class number."); + AddInput("Labels", + "(Tensor) The ground truth which is a 2-D tensor. Labels is a " + "Tensor with shape [N x NT], where NT is the number of" + "true labels for each example."); + AddInput("CustomizedSamples", + "(Tensor, default: Tensor), A 2-D tensor with shape [N, " + "NT + S]," + " where N is the batch size, NT is the number of true labels " + "and S is the number of negtive sample for each example." + "The first NT elements of each row should be the same with true " + "labels, " + "followed by S custom negtive samples. This tensor" + "is only used when use_customized_samples is true.") + .AsDispensable(); + AddInput( + "CustomizedProbabilities", + "(Tensor, default: Tensor), A 2-D tensor with shape [N, NT + S]." + "The tensor has the same shape with CustomSamples," + "and each element represents probability of element in CustomSamples. " + "This " + "tensor is only used when use_customized_samples is true.") + .AsDispensable(); + AddOutput("Samples", + "(Tensor, default: Tensor), A 2-D tensor with shape [N, " + "NT + S]." + "The outputs value of sampler, including NT true lables and S " + "negetive samples " + "for each example. This will be used in" + "backward calculation.") + .AsIntermediate(); + AddOutput( + "Probabilities", + "(Tensor, default: Tensor), A 2-D tensor with shape [N, NT + S]." + "The probabilites of sampled positive and negtive labels.") + .AsIntermediate(); + AddOutput("SampledLogits", + "(Tensor, default: Tensor), A 2-D tensor with shape" + "[N, NT + S]. The outputs value of sampled logits, which will be" + "used in backward propagation.") + .AsIntermediate(); + AddOutput( + "SampledLabels", + "(Tensor, default: Tensor), A 2-D tensor. The sampled labels" + "with shape [N, NT]. The tonsor contains hard labels as input to " + " softmax op, that is 0, 1, ..., NT-1 because of the first NT elements" + " of Sampels are positive lables."); + AddAttr( + "use_customized_samples", + "An indicator whether to use customized samples with probabilities, if " + "True" + "the operator will use customized samples and customized probabilities" + "otherwise, the operator will generate them by itself.") + .SetDefault(false); + AddAttr( + "uniq", + "An indicator whether to sample non-repetitive negtive labels, if True" + "the operator will sample negtive labels without replacement." + "Otherwise, the operator will sample negtive labels with replacement.") + .SetDefault(true); + AddAttr( + "remove_accidental_hits", + "An indicator whether to remove accidental hits when samples hits true" + "labels, the removal is implemented by subtracting the corresponding" + "logits by float_max to subpress their softmax to be zero.") + .SetDefault(true); + AddAttr("num_samples", "The number of negative samples."); + AddAttr("seed", "Random seed for generating samples").SetDefault(0); + + AddComment(R"DOC( + """ + Computes sampled output training logits and labels suitable for implementing + sampled softmax. + """ + +)DOC"); + } +}; + +class SampleLogitsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Logits"), + "Input(Logits) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Labels"), + "Input(Labels) should be not null."); + + PADDLE_ENFORCE(ctx->HasOutput("Samples"), + "Output(Samples) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("Probabilities"), + "Output(Probabilities) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("SampledLogits"), + "Output(SampledLogits) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("SampledLabels"), + "Output(SampledLabels) should be not null."); + + auto logits_dims = ctx->GetInputDim("Logits"); + auto labels_dims = ctx->GetInputDim("Labels"); + + PADDLE_ENFORCE_EQ( + logits_dims.size(), 2UL, + "The logits of softmax_with_cross_entropy should be a 2-D tensor."); + PADDLE_ENFORCE_EQ(labels_dims.size(), 2UL, + "The labels should be a 2-D tensor."); + + const int num_samples = ctx->Attrs().Get("num_samples"); + const int num_sampled_classes = labels_dims[1] + num_samples; + ctx->SetOutputDim("Samples", {logits_dims[0], num_sampled_classes}); + ctx->SetOutputDim("Probabilities", {logits_dims[0], num_sampled_classes}); + ctx->SetOutputDim("SampledLogits", {logits_dims[0], num_sampled_classes}); + ctx->SetOutputDim("SampledLabels", {logits_dims[0], labels_dims[1]}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Logits")); + framework::OpKernelType kt = + framework::OpKernelType(data_type, ctx.device_context()); + return kt; + } +}; + +// UNDERSTAND: InferShape for Grad +class SampleLogitsOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Logits"), + "Input(Logits) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Labels"), + "Input(Labels) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Samples"), + "Input(Samples) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("SampledLogits"), + "Input(SampledLogits) should be not null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("SampledLogits")), + "Input(SampledLogits@Grad) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Logits")), + "Output(Logits@Grad) should be not null."); + + auto logit_dims = ctx->GetInputDim("Logits"); + auto label_dims = ctx->GetInputDim("Labels"); + PADDLE_ENFORCE_EQ(label_dims.size(), 2UL, + "The label should be a 2-D tensor."); + PADDLE_ENFORCE_EQ(logit_dims.size(), 2UL, + "The logits should be a 2-D tensor."); + + ctx->SetOutputDim(framework::GradVarName("Logits"), + ctx->GetInputDim("Logits")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar( + ctx.InputVar(framework::GradVarName("SampledLogits"))); + framework::OpKernelType kt = + framework::OpKernelType(data_type, ctx.device_context()); + return kt; + } +}; + +// UNDERSTAND: what's the rule for making a GradMaker TODO +class SampleLogitsGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* grad_op = new framework::OpDesc(); + grad_op->SetType("sample_logits_grad"); + grad_op->SetInput("Logits", Input("Logits")); + grad_op->SetInput("Labels", Input("Labels")); + grad_op->SetInput("Samples", Output("Samples")); + grad_op->SetInput("SampledLogits", Output("SampledLogits")); + grad_op->SetInput(framework::GradVarName("SampledLogits"), + OutputGrad("SampledLogits")); + grad_op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits")); + grad_op->SetAttrMap(Attrs()); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(sample_logits, ops::SampleLogitsOp, ops::SampleLogitsOpMaker, + ops::SampleLogitsGradMaker); +REGISTER_OPERATOR(sample_logits_grad, ops::SampleLogitsOpGrad); +REGISTER_OP_CPU_KERNEL(sample_logits, ops::SampleLogitsKernel, + ops::SampleLogitsKernel); +REGISTER_OP_CPU_KERNEL(sample_logits_grad, ops::SampleLogitsGradKernel, + ops::SampleLogitsGradKernel); diff --git a/paddle/fluid/operators/sample_logits_op.cu b/paddle/fluid/operators/sample_logits_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..fb49793b730f72d66dc846f233bd95ebdab37c52 --- /dev/null +++ b/paddle/fluid/operators/sample_logits_op.cu @@ -0,0 +1,257 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/sample_prob.h" +#include "paddle/fluid/operators/math/softmax.h" +#include "paddle/fluid/operators/sample_logits_op.h" + +namespace paddle { +namespace operators { + +// UNDERSTAND: something like take_along_axis in numpy. +template +__global__ void GPUTakeAlongD1(size_t size, const int batch_size, + const int array_slice_size, + const int idx_slice_size, const T* p_array, + const int64_t* p_index, T* p_value) { + const auto value_slice_size = idx_slice_size; + int idx = blockDim.x * blockIdx.x + threadIdx.x; + int step_size = blockDim.x * gridDim.x; + + for (; idx < size; idx += step_size) { + int i = idx / idx_slice_size; + auto array_index = p_index[idx]; + p_value[idx] = p_array[i * array_slice_size + array_index]; + } +} + +// UNDERSTAND: something like put_along_axis in numpy but if there is duplicate +// indices, scatter is done in += way. +template +__global__ void GPUPutAlongD1(size_t size, const int batch_size, + const int array_slice_size, + const int idx_slice_size, T* p_array, + const int64_t* p_index, const T* p_value) { + const auto value_slice_size = idx_slice_size; + int idx = blockDim.x * blockIdx.x + threadIdx.x; + int step_size = blockDim.x * gridDim.x; + + // size == batch_size + for (; idx < size; idx += step_size) { + int i = idx; + for (int j = 0; j < idx_slice_size; ++j) { + auto array_index = p_index[i * idx_slice_size + j]; + p_array[i * array_slice_size + array_index] += + p_value[i * idx_slice_size + j]; + } + } +} + +// UNDERSTAND: set label as 0,1,...,num_true-1 +template +__global__ void GPUSetLabel(size_t size, const int num_true, int64_t* p_array) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + int step_size = blockDim.x * gridDim.x; + + for (; idx < size; idx += step_size) { + p_array[idx] = idx % num_true; + } +} + +// UNDERSTAND: compute accidentdal hits from samples and minus corresponding +// logits by a float max, here 1e20 +template +__global__ void gpu_compute_remove_accidental_hits(const int size, + const int num_true, + const int idx_slice_size, + const int64_t* p_index, + T* p_value) { + const auto value_slice_size = idx_slice_size; + int idx = blockDim.x * blockIdx.x + threadIdx.x; + int step_size = blockDim.x * gridDim.x; + + for (; idx < size; idx += step_size) { + int i = idx / idx_slice_size; + if (idx % idx_slice_size < num_true) continue; + for (int j = 0; j < num_true; ++j) { + const auto true_idx = i * idx_slice_size + j; + if (p_index[true_idx] == p_index[idx]) { + p_value[idx] -= 1e20; + break; + } + } + } +} + +template +class SampleLogitsCUDAKernel : public framework::OpKernel { + public: + using Tensor = framework::Tensor; + void Compute(const framework::ExecutionContext& context) const override { + // get necessary inputs + const Tensor* logits = context.Input("Logits"); + const Tensor* labels = context.Input("Labels"); + VLOG(3) << "Enter SampleLogitsCUDAKernel"; + + // get necessary outputs + Tensor* samples = context.Output("Samples"); + Tensor* probabilities = context.Output("Probabilities"); + Tensor* sampled_logits = context.Output("SampledLogits"); + Tensor* sampled_labels = context.Output("SampledLabels"); + + // shapes + const auto batch_size = logits->dims()[0]; + const auto num_classes = logits->dims()[1]; + const auto labels_dim = labels->dims(); + const auto num_true = labels_dim[1]; + const auto samples_dim = samples->dims(); + + // attrs + const auto num_samples = context.Attr("num_samples"); + const bool use_customized_samples = + context.Attr("use_customized_samples"); + const bool uniq = context.Attr("uniq"); + const bool remove_accidental_hits = + context.Attr("remove_accidental_hits"); + + // device contexts + auto& dev_ctx = context.cuda_device_context(); + + // UNDERSTAND: allocate memories for temporaries + sampled_logits->mutable_data(samples_dim, context.GetPlace()); + math::SetConstant set_zero; + set_zero(dev_ctx, sampled_logits, static_cast(0)); + + auto sampled_labels_data = + sampled_labels->mutable_data(labels_dim, context.GetPlace()); + int threads = 512; + size_t size = batch_size * num_true; + int grid = (size + threads - 1) / threads; + GPUSetLabel< + T><<>>( + size, num_true, sampled_labels_data); + + if (use_customized_samples) { + const Tensor* customized_samples = + context.Input("CustomizedSamples"); + const Tensor* customized_probabilities = + context.Input("CustomizedProbabilities"); + samples->ShareDataWith(*customized_samples); + probabilities->ShareDataWith(*customized_probabilities); + } else { + samples->mutable_data(context.GetPlace()); + probabilities->mutable_data(samples_dim, context.GetPlace()); + // UNDERSTAND: sampling + const auto seed = context.Attr("seed"); + auto sampler_with_prob = math::GPUSampleWithProb(); + sampler_with_prob(context.cuda_device_context(), seed, num_classes, uniq, + num_samples, labels, samples, probabilities); + } + + // UNDERSTAND: gather sampled logits and remove accidental hits if needed + const auto num_take = samples->dims()[1]; + const auto array_dims = logits->dims(); + const auto idx_dims = samples->dims(); + + const T* p_array = logits->data(); + const int64_t* p_index = samples->data(); + T* p_value = sampled_logits->data(); + + // src slice size + const auto array_slice_size = array_dims[1]; + // index slice size + const auto idx_slice_size = idx_dims[1]; + + size = batch_size * num_take; + grid = (size + threads - 1) / threads; + GPUTakeAlongD1< + T><<>>( + size, batch_size, array_slice_size, idx_slice_size, p_array, p_index, + p_value); + + if (remove_accidental_hits) { + const size_t size = batch_size * (num_true + num_samples); + int grid = (size + threads - 1) / threads; + gpu_compute_remove_accidental_hits< + T><<>>( + size, num_true, idx_slice_size, p_index, p_value); + } + + // subtracted sampled logits with logQ(y|x) + auto probs = EigenMatrix::From(*probabilities); + auto smp_logits = EigenMatrix::From(*sampled_logits); + smp_logits.device(*dev_ctx.eigen_device()) = + (smp_logits - probs.log().unaryExpr(TolerableValue())) + .unaryExpr(TolerableValue()); + } +}; + +template +class SampleLogitsGradCUDAKernel : public framework::OpKernel { + public: + using Tensor = framework::Tensor; + void Compute(const framework::ExecutionContext& context) const override { + auto logits_grad = context.Output(framework::GradVarName("Logits")); + const Tensor* samples = context.Input("Samples"); + const Tensor* sampled_logits_grad = + context.Input(framework::GradVarName("SampledLogits")); + logits_grad->mutable_data(context.GetPlace()); + + auto& dev_ctx = context.cuda_device_context(); + math::SetConstant set_zero; + set_zero(dev_ctx, logits_grad, static_cast(0)); + + // UNDERSTAND: scatter it back to logit_grad + const auto batch_size = samples->dims()[0]; + const auto num_put = samples->dims()[1]; + const auto array_dims = logits_grad->dims(); + const auto idx_dims = samples->dims(); + + T* p_array = logits_grad->data(); + const int64_t* p_index = samples->data(); + const T* p_value = sampled_logits_grad->data(); + + // src slice size + const auto array_slice_size = array_dims[1]; + // index slice size + const auto idx_slice_size = idx_dims[1]; + + int threads = 128; + const size_t size = batch_size; + int grid = (size + threads - 1) / threads; + + GPUPutAlongD1< + T><<>>( + size, batch_size, array_slice_size, idx_slice_size, p_array, p_index, + p_value); + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL(sample_logits, ops::SampleLogitsCUDAKernel, + ops::SampleLogitsCUDAKernel); +REGISTER_OP_CUDA_KERNEL(sample_logits_grad, + ops::SampleLogitsGradCUDAKernel, + ops::SampleLogitsGradCUDAKernel); diff --git a/paddle/fluid/operators/sample_logits_op.h b/paddle/fluid/operators/sample_logits_op.h new file mode 100644 index 0000000000000000000000000000000000000000..b55a24863cc09d5f80e07aedbbb5b3d9ac99e69e --- /dev/null +++ b/paddle/fluid/operators/sample_logits_op.h @@ -0,0 +1,245 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/sample_prob.h" +#include "paddle/fluid/operators/math/softmax.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenMatrix = framework::EigenMatrix; + +template +struct TolerableValue { + HOSTDEVICE T operator()(const T& x) const { + PADDLE_ASSERT(std::is_floating_point::value); + const T kApproInf = 1e20; + if (x == INFINITY) return kApproInf; + if (x == -INFINITY) return -kApproInf; + return x; + } +}; + +// UNDERSTAND: something like take_along_axis in numpy. +template +static void CPUTakeAlongD1(const platform::DeviceContext& ctx, + const framework::Tensor& array, + const framework::Tensor& index, + framework::Tensor* value) { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace())); + // UNDERSTAND: check shape src(B, C), index(B, K), out should also be (B, K) + PADDLE_ENFORCE(index.dims().size() == 2 && array.dims().size() == 2 && + index.dims()[0] == array.dims()[0] && + index.dims() == value->dims()); + + const auto batch_size = index.dims()[0]; + const auto num_take = index.dims()[1]; + const auto array_dims = array.dims(); + const auto idx_dims = index.dims(); + + // UNDERSTAND: no allocations here + const T* p_array = array.data(); + const int64_t* p_index = index.data(); + T* p_value = value->data(); + + // src slice size + const auto array_slice_size = array_dims[1]; + + // index slice size + const auto idx_slice_size = idx_dims[1]; + const auto value_slice_size = idx_slice_size; + + for (int i = 0; i < batch_size; ++i) { + for (int j = 0; j < num_take; ++j) { + auto array_index = p_index[i * idx_slice_size + j]; + p_value[i * value_slice_size + j] = + p_array[i * array_slice_size + array_index]; + } + } +} + +// UNDERSTAND: something like put_along_axis in numpy but if there is duplicate +// indices, scatter is done in += way. +template +static void CPUPutAlongD1(const platform::DeviceContext& ctx, + framework::Tensor* array, + const framework::Tensor& index, + const framework::Tensor& value) { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace())); + // UNDERSTAND: check shape src(B, C), index(B, K), out should also be (B, K) + PADDLE_ENFORCE(index.dims().size() == 2 && array->dims().size() == 2 && + index.dims()[0] == array->dims()[0] && + index.dims() == value.dims()); + const auto batch_size = index.dims()[0]; + const auto num_put = index.dims()[1]; + auto array_dims = array->dims(); + auto idx_dims = index.dims(); + + // UNDERSTAND: no allocations here + T* p_array = array->data(); + const int64_t* p_index = index.data(); + const T* p_value = value.data(); + + // slice sizes + const auto array_slice_size = array_dims[1]; + const auto idx_slice_size = idx_dims[1]; + const auto value_slice_size = idx_slice_size; + + for (int i = 0; i < batch_size; ++i) { + for (int j = 0; j < num_put; ++j) { + auto array_index = p_index[i * idx_slice_size + j]; + p_array[i * array_slice_size + array_index] += + p_value[i * value_slice_size + j]; + } + } +} + +// UNDERSTAND: compute accidentdal hits from samples and minus corresponding +// logits by a float max, here 1e20 +template +static void compute_remove_accidental_hits(const platform::DeviceContext& ctx, + framework::Tensor* sampled_logits, + const framework::Tensor& samples, + const int num_true) { + const auto batch_size = sampled_logits->dims()[0]; + const auto num_sampled_classes = sampled_logits->dims()[1]; + T* sampled_logits_data = sampled_logits->data(); + const auto samples_data = samples.data(); + + std::unordered_set tmp_true_labels; + for (int i = 0; i < batch_size; ++i) { + tmp_true_labels.clear(); + tmp_true_labels.insert(samples_data + i * num_sampled_classes, + samples_data + i * num_sampled_classes + num_true); + for (int j = num_true; j < num_sampled_classes; ++j) { + const auto idx = i * num_sampled_classes + j; + if (tmp_true_labels.find(samples_data[idx]) != tmp_true_labels.end()) + sampled_logits_data[idx] -= 1e20; + } + } +} + +template +class SampleLogitsKernel : public framework::OpKernel { + public: + using Tensor = framework::Tensor; + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE(platform::is_cpu_place(context.GetPlace()), + "This kernel only runs on CPU."); + VLOG(3) << "Enter SampleLogitsKernel"; + // get necessary inputs + const Tensor* logits = context.Input("Logits"); + const Tensor* labels = context.Input("Labels"); + + // get necessary outputs + Tensor* samples = context.Output("Samples"); + Tensor* probabilities = context.Output("Probabilities"); + Tensor* sampled_logits = context.Output("SampledLogits"); + Tensor* sampled_labels = context.Output("SampledLabels"); + + // shapes + const auto batch_size = logits->dims()[0]; + const auto num_classes = logits->dims()[1]; + const auto labels_dim = labels->dims(); + const auto num_true = labels_dim[1]; + const auto samples_dim = samples->dims(); + + // attrs + const auto num_samples = context.Attr("num_samples"); + const bool use_customized_samples = + context.Attr("use_customized_samples"); + const bool remove_accidental_hits = + context.Attr("remove_accidental_hits"); + + // device contexts + auto& dev_ctx = + context.template device_context(); + + // UNDERSTAND: allocate memories for temporaries + sampled_logits->mutable_data(samples_dim, context.GetPlace()); + auto sampled_labels_data = + sampled_labels->mutable_data(labels_dim, context.GetPlace()); + for (int i = 0; i < batch_size; ++i) { + for (int j = 0; j < num_true; ++j) { + sampled_labels_data[i * num_true + j] = j; + } + } + + if (use_customized_samples) { + const Tensor* customized_samples = + context.Input("CustomizedSamples"); + const Tensor* customized_probabilities = + context.Input("CustomizedProbabilities"); + samples->ShareDataWith(*customized_samples); + probabilities->ShareDataWith(*customized_probabilities); + } else { + samples->mutable_data(context.GetPlace()); + probabilities->mutable_data(samples_dim, context.GetPlace()); + // UNDERSTAND: sampling + const auto seed = context.Attr("seed"); + auto sampler_with_prob = + math::SampleWithProb(); + sampler_with_prob(dev_ctx, math::LogUniformSampler(num_classes, seed), + num_samples, labels, samples, probabilities); + } + + // UNDERSTAND: gather sampled logits and remove accidental hits if needed + CPUTakeAlongD1(dev_ctx, *logits, *samples, sampled_logits); + if (remove_accidental_hits) { + compute_remove_accidental_hits(dev_ctx, sampled_logits, *samples, + num_true); + } + + // subtracted sampled logits with logQ(y|x) + auto probs = EigenMatrix::From(*probabilities); + auto smp_logits = EigenMatrix::From(*sampled_logits); + smp_logits.device(*dev_ctx.eigen_device()) = + (smp_logits - probs.log().unaryExpr(TolerableValue())) + .unaryExpr(TolerableValue()); + } +}; + +template +class SampleLogitsGradKernel : public framework::OpKernel { + public: + using Tensor = framework::Tensor; + void Compute(const framework::ExecutionContext& context) const override { + auto logits_grad = context.Output(framework::GradVarName("Logits")); + const Tensor* samples = context.Input("Samples"); + const Tensor* sampled_logits_grad = + context.Input(framework::GradVarName("SampledLogits")); + logits_grad->mutable_data(context.GetPlace()); + + auto& dev_ctx = + context.template device_context(); + math::SetConstant set_zero; + set_zero(dev_ctx, logits_grad, static_cast(0)); + + // UNDERSTAND: scatter it back to logit_grad + CPUPutAlongD1(dev_ctx, logits_grad, *samples, *sampled_logits_grad); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc index 1eebadc2c980ddf1cbaaefef1568dd401d0c77ed..d3dcd1f96a986d2450c8af780a12183f7dfc66d5 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc @@ -22,6 +22,9 @@ class SequenceEnumerateOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { + if (ctx->IsRuntime()) { + return; + } PADDLE_ENFORCE( ctx->HasInput("X"), "Input(X) of SequecceEnumerate operator should not be null."); @@ -31,11 +34,11 @@ class SequenceEnumerateOp : public framework::OperatorWithKernel { const auto x_dims = ctx->GetInputDim("X"); PADDLE_ENFORCE_EQ( - x_dims.size(), 2UL, + x_dims.size(), 2, "Input(X) of SequenceEnumerate operator's rank should be 2."); - PADDLE_ENFORCE_EQ( - x_dims[1], 1UL, - "Input(X) of SequenceEnumerate operator's 2nd dimension should be 1."); + PADDLE_ENFORCE_EQ(x_dims[1], 1, + "Input(X) of SequenceEnumerate operator's 2nd " + "dimension should be 1."); const auto win_size = ctx->Attrs().Get("win_size"); ctx->SetOutputDim("Out", {x_dims[0], win_size}); diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu index 28821e7129c1601f1214b0b56696fbf526a2123f..d5deb7582c7c00f3102ea568a716b715611212ce 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu @@ -65,6 +65,7 @@ class SequenceEnumerateOpCUDAKernel : public framework::OpKernel { auto lod0 = in_lod[0]; auto in_len = in->numel(); auto in_data = in->data(); + out->Resize({in_dims[0], win_size}); auto out_data = out->mutable_data(context.GetPlace()); // Copy LoD to GPU const size_t* dev_in_lod_ptr = lod0.CUDAData(context.GetPlace()); @@ -72,6 +73,7 @@ class SequenceEnumerateOpCUDAKernel : public framework::OpKernel { CalcOutPut<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>( in_data, dev_in_lod_ptr, lod0.size(), win_size, pad_value, out_data); + out->set_lod(in->lod()); } }; diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h index dc18d9b2071303377505155476b87ed029eaf986..18da69993b2ad5879dd4678ec0d4b06d7e30cb0a 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h @@ -39,6 +39,7 @@ class SequenceEnumerateKernel : public framework::OpKernel { // Generate enumerate sequence set auto lod0 = in_lod[0]; auto in_data = in->data(); + out->Resize({in_dims[0], win_size}); auto out_data = out->mutable_data(context.GetPlace()); for (size_t i = 0; i < lod0.size() - 1; ++i) { for (size_t idx = lod0[i]; idx < lod0[i + 1]; ++idx) { @@ -49,6 +50,7 @@ class SequenceEnumerateKernel : public framework::OpKernel { } } } + out->set_lod(in->lod()); } }; diff --git a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu index 619c40dbd10ad6b538f2d4e3567966b222fc5e2d..0401c22c92e1a9be35c2ff6b2c7e95924afe3f1b 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu @@ -64,8 +64,7 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel { auto* out = ctx.Output("Out"); auto lod = in->lod(); - PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now."); - PADDLE_ENFORCE_EQ(lod[0].back(), (size_t)in->numel(), + PADDLE_ENFORCE_EQ(lod[lod.size() - 1].back(), (size_t)in->numel(), "The actual size mismatches with the LoD information."); auto tokens = ctx.Attr>("tokens"); auto in_len = in->numel(); @@ -85,10 +84,9 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel { num_erased.begin() + 1); // Copy LoD to GPU - auto lod0 = lod[0]; - auto lod_len = lod0.size(); - const size_t* dev_in_lod_ptr = lod0.CUDAData(ctx.GetPlace()); - + auto last_lod = lod[lod.size() - 1]; + auto lod_len = last_lod.size(); + const size_t* dev_in_lod_ptr = last_lod.CUDAData(ctx.GetPlace()); // Calc output LoD thrust::device_vector dev_out_lod(lod_len); size_t* dev_out_lod_ptr = thrust::raw_pointer_cast(dev_out_lod.data()); @@ -96,13 +94,16 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel { PADDLE_CUDA_NUM_THREADS, 0, stream>>>( num_erased_ptr, dev_in_lod_ptr, lod_len, dev_out_lod_ptr); // Set LoD for output - std::vector out_lod0(dev_out_lod.begin(), dev_out_lod.end()); + std::vector out_last_lod(dev_out_lod.begin(), dev_out_lod.end()); framework::LoD out_lod; - out_lod.push_back(out_lod0); + for (size_t i = 0; i < lod.size() - 1; ++i) { + out_lod.push_back(lod[i]); + } + out_lod.push_back(out_last_lod); out->set_lod(out_lod); // Set output - out->Resize({static_cast(out_lod0.back()), 1}); + out->Resize({static_cast(out_last_lod.back()), 1}); auto out_dat = out->mutable_data(ctx.GetPlace()); SetOutput<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_dat, in_len, diff --git a/paddle/fluid/operators/sequence_ops/sequence_erase_op.h b/paddle/fluid/operators/sequence_ops/sequence_erase_op.h index 265390528a15aa060900276f98128d754fc907fe..af5a64dce5d2484ad9006f0c30e8851746794f38 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_erase_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_erase_op.h @@ -28,19 +28,18 @@ class SequenceEraseKernel : public framework::OpKernel { auto* out = ctx.Output("Out"); auto lod = in->lod(); - PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now."); - PADDLE_ENFORCE_EQ(lod[0].back(), (size_t)in->numel(), + PADDLE_ENFORCE_EQ(lod[lod.size() - 1].back(), (size_t)in->numel(), "The actual size mismatches with the LoD information."); auto tokens = ctx.Attr>("tokens"); auto in_len = in->numel(); auto in_dat = in->data(); - auto lod0 = lod[0]; + auto last_lod = lod[lod.size() - 1]; std::vector num_erased(in_len + 1, 0); - std::vector out_lod0(1, 0); - for (size_t i = 0; i < lod0.size() - 1; ++i) { + std::vector out_last_lod(1, 0); + for (size_t i = 0; i < last_lod.size() - 1; ++i) { size_t num_out = 0; - for (auto j = lod0[i] + 1; j <= lod0[i + 1]; ++j) { + for (auto j = last_lod[i] + 1; j <= last_lod[i + 1]; ++j) { num_erased[j] = num_erased[j - 1]; if (std::find(tokens.begin(), tokens.end(), in_dat[j - 1]) != tokens.end()) { @@ -49,7 +48,7 @@ class SequenceEraseKernel : public framework::OpKernel { num_out += 1; } } - out_lod0.push_back(out_lod0.back() + num_out); + out_last_lod.push_back(out_last_lod.back() + num_out); } auto out_len = in_len - num_erased[in_len]; @@ -62,7 +61,10 @@ class SequenceEraseKernel : public framework::OpKernel { } } framework::LoD out_lod; - out_lod.push_back(out_lod0); + for (size_t i = 0; i < lod.size() - 1; ++i) { + out_lod.push_back(lod[i]); + } + out_lod.push_back(out_last_lod); out->set_lod(out_lod); } }; diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc index 27e0201bd70df59c58eaa7567d5bb69eb1b721b4..f6c42415301bc8d6f3509bfba2ff356265643bad 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc @@ -48,10 +48,10 @@ class SequenceExpandOp : public framework::OperatorWithKernel { auto& x_lod = x_var->Get().lod(); auto& y_lod = y_var->Get().lod(); - PADDLE_ENFORCE_LE(x_lod.size(), 1, + PADDLE_ENFORCE_LE(x_lod.size(), 1UL, "Level number of Input(X)'s lod should not be " "greater than 1."); - PADDLE_ENFORCE_GT(y_lod.size(), 0, + PADDLE_ENFORCE_GT(y_lod.size(), 0UL, "Level number of Input(Y)'s lod should be " "greater than 0."); PADDLE_ENFORCE( @@ -69,7 +69,8 @@ class SequenceExpandOp : public framework::OperatorWithKernel { "size of Input(X)'s first level lod should be equal to " "size of Input(Y)'s referred level lod."); } else { - PADDLE_ENFORCE_EQ(x_dims[0], y_lod[ref_level].size() - 1, + PADDLE_ENFORCE_EQ(x_dims[0], + static_cast(y_lod[ref_level].size()) - 1, "When Input(X)'s lod is null, the dims[0] of " "Input(X) should match the " "size of Input(Y)'s referred level lod."); diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu index afc08c7b3f6596efd3b6e0b74c17aa3c9268c47d..888d1a12e6751eeb91f0af04b50cf6d5bea74162 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include +#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/sequence_ops/sequence_expand_op.h" #include "paddle/fluid/platform/cuda_primitives.h" @@ -88,6 +89,49 @@ void GetOutputOffset(const framework::Vector& x_lod, } } +template +static int ExpandByMemoryCopy(const platform::CUDADeviceContext& context, + const LoDTensor& x, LoDTensor* out, + const framework::Vector& x_lod, + const framework::Vector& ref_lod, + bool do_copy) { + auto out_data = out->data(); + auto x_data = x.data(); + + auto& gpu_place = boost::get(context.GetPlace()); + + int x_item_length = x.numel() / x.dims()[0]; + int out_offset = 0; + int num_copys = 0; + for (size_t i = 1; i < ref_lod.size(); ++i) { + int repeat_num = ref_lod[i] - ref_lod[i - 1]; + int x_start = x_lod[i - 1]; + int x_end = x_lod[i]; + int x_seq_len = x_end - x_start; + if (repeat_num > 0) { + if (do_copy) { + int out_start = out_offset; + if (out->lod().size() == 1) { + out_start = out->lod()[0][out_offset]; + } + for (int j = 0; j < repeat_num; j++) { + for (int k = 0; k < x_seq_len; k++) { + memory::Copy( + gpu_place, + out_data + (out_start + j * x_seq_len + k) * x_item_length, + gpu_place, x_data + (x_start + k) * x_item_length, + sizeof(T) * x_item_length, context.stream()); + } + } + } else { + num_copys += repeat_num * x_seq_len; + } + } + out_offset += repeat_num; + } + return num_copys; +} + template struct SequenceExpandFunctor { void operator()( @@ -95,22 +139,40 @@ struct SequenceExpandFunctor { const framework::Vector& x_lod, /*expand source lod*/ const framework::Vector& ref_lod, /*expand referenced lod*/ LoDTensor* out) { - int x_item_length = x.numel() / x.dims()[0]; - framework::Vector out_offset(x_lod.size()); - GetOutputOffset(x_lod, ref_lod, &out_offset); - - int thread_x = std::min(32, std::max(static_cast(ref_lod.size()), 16)); - int thread_y = 16; - int thread_z = 1024 / thread_x / thread_y; - int block_x = static_cast(ref_lod.size()); - dim3 block_size(thread_x, thread_y, thread_z); - dim3 grid_size(block_x, 1); + int num_copys = + ExpandByMemoryCopy(context, x, out, x_lod, ref_lod, false); + // Sometimes direct copies will be faster, this maybe need deeply analysis. + if (num_copys < 5) { + ExpandByMemoryCopy(context, x, out, x_lod, ref_lod, true); + } else { + int x_item_length = x.numel() / x.dims()[0]; + size_t x_lod_size = x_lod.size(); + framework::Vector out_offset(x_lod_size * 2 + ref_lod.size()); + GetOutputOffset(x_lod, ref_lod, &out_offset); + + for (size_t i = 0; i < x_lod_size; ++i) { + out_offset[x_lod_size + i] = x_lod[i]; + } + for (size_t i = 0; i < ref_lod.size(); ++i) { + out_offset[2 * x_lod_size + i] = ref_lod[i]; + } - sequence_expand_kernel<<>>( - x.data(), x_lod.CUDAData(context.GetPlace()), - ref_lod.CUDAData(context.GetPlace()), - out_offset.CUDAData(context.GetPlace()), x_lod.size(), x_item_length, - out->mutable_data(context.GetPlace())); + const size_t* out_offset_data = out_offset.CUDAData(context.GetPlace()); + const size_t* x_lod_data = out_offset_data + x_lod_size; + const size_t* ref_lod_data = out_offset_data + 2 * x_lod_size; + + int thread_x = + std::min(32, std::max(static_cast(ref_lod.size()), 16)); + int thread_y = 16; + int thread_z = 1024 / thread_x / thread_y; + int block_x = static_cast(ref_lod.size()); + dim3 block_size(thread_x, thread_y, thread_z); + dim3 grid_size(block_x, 1); + + sequence_expand_kernel<<>>( + x.data(), x_lod_data, ref_lod_data, out_offset_data, x_lod_size, + x_item_length, out->mutable_data(context.GetPlace())); + } } }; diff --git a/paddle/fluid/operators/shape_op.cc b/paddle/fluid/operators/shape_op.cc index 1be9fe47af71d31ce2e0eba807ea4a43601f8aca..efc497fa47d1d954bbd1e214b43f5de4c76b0714 100644 --- a/paddle/fluid/operators/shape_op.cc +++ b/paddle/fluid/operators/shape_op.cc @@ -35,14 +35,15 @@ class ShapeOp : public framework::OperatorWithKernel { class ShapeOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("Input", "(Tensor), The input tensor."); - AddOutput("Out", - "(Tensor), The shape of input tensor, the data type of the shape" - " is int32_t, will be on the same device with the input Tensor."); + AddInput("Input", "(LoDTensor), The input tensor."); + AddOutput( + "Out", + "(LoDTensor), The shape of input tensor, the data type of the shape" + " is int32_t, will be on the same device with the input Tensor."); AddComment(R"DOC( -Shape Operator +Shape Operator. -Get the shape of input tensor. Only support CPU input Tensor now. +Return the shape of the input. )DOC"); } }; diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc index 0397c7791e1768393ff642743d2f7085b25fb551..7754d2bfebdbc81e25432641b2eb4315386f75ff 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc @@ -46,10 +46,10 @@ class SoftmaxWithCrossEntropyOpMaker .SetDefault(false); AddAttr( "numeric_stable_mode", - "(bool, default: false), A flag to indicate whether to use more " + "(bool, default: true), A flag to indicate whether to use more " "numerically stable algorithm. This flag is only valid when " "soft_label is false and GPU is used.") - .SetDefault(false); + .SetDefault(true); AddAttr( "ignore_index", "(int, default -100), Specifies a target value that is ignored and" diff --git a/paddle/fluid/operators/spectral_norm_op.cc b/paddle/fluid/operators/spectral_norm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..357d055756523cd83bf0e4b30719155b32c65974 --- /dev/null +++ b/paddle/fluid/operators/spectral_norm_op.cc @@ -0,0 +1,197 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/fluid/operators/spectral_norm_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class SpectralNormOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Weight"), + "Input(Weight) of SpectralNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("U"), + "Input(U) of SpectralNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("V"), + "Input(V) of SpectralNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SpectralNormOp should not be null."); + + auto dim_weight = ctx->GetInputDim("Weight"); + auto rank_weight = dim_weight.size(); + PADDLE_ENFORCE(rank_weight >= 2 && rank_weight <= 5, + "The rank of Input(Weights) can only be 2, 3," + "4, 5 for fc, conv1d, conv2d, conv3d layers."); + + int dim = ctx->Attrs().Get("dim"); + int power_iters = ctx->Attrs().Get("power_iters"); + PADDLE_ENFORCE(dim == 0 || dim == 1, "Attr(dim) can only be 0 or 1"); + PADDLE_ENFORCE(power_iters >= 0, + "Attr(power_iters) should be larger equal then 0"); + + int h = dim_weight[dim]; + int w = 1; + for (int i = 0; i < rank_weight; i++) { + if (i != dim) { + w *= dim_weight[i]; + } + } + auto dim_u = ctx->GetInputDim("U"); + auto dim_v = ctx->GetInputDim("V"); + PADDLE_ENFORCE_EQ(dim_u[0], h, + "Input(U) dims[0] should be equal to " + "Input(Weight) dims[Attr(dim)]"); + PADDLE_ENFORCE_EQ( + dim_v[0], w, + "Input(V) dims[0] should be equal to " + "the product of Input(Weight) dims except dims[Attr(dim)]"); + + ctx->SetOutputDim("Out", dim_weight); + ctx->ShareLoD("Weight", /*->*/ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("Weight")->type(), + ctx.GetPlace()); + } +}; + +class SpectralNormOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Weight", + "The input weight tensor of spectral_norm operator, " + "This can be a 2-D, 3-D, 4-D, 5-D tensor which is the " + "weights of fc, conv1d, conv2d, conv3d layer."); + AddInput("U", + "The weight_u tensor of spectral_norm operator, " + "This can be a 1-D tensor in shape [H, 1]," + "H is the 1st dimentions of Weight after reshape" + "corresponding by Attr(dim). As for Attr(dim) = 1" + "in conv2d layer with weight shape [M, C, K1, K2]" + "Weight will be reshape to [C, M*K1*K2], U will" + "be in shape [C, 1]."); + AddInput("V", + "The weight_v tensor of spectral_norm operator, " + "This can be a 1-D tensor in shape [W, 1], " + "W is the 2nd dimentions of Weight after reshape " + "corresponding by Attr(dim). As for Attr(dim) = 1 " + "in conv2d layer with weight shape [M, C, K1, K2] " + "Weight will be reshape to [C, M*K1*K2], V will " + "be in shape [M*K1*K2, 1]."); + AddOutput("Out", + "The output weight tensor of spectral_norm operator, " + "This tensor is in same shape with Input(Weight)."); + + AddAttr("dim", + "The index of dimension which should be permuted " + "to the first before reshaping Input(Weight) to " + "matrix, it should be set as 0 if Input(Weight) is " + "the weight of fc layer, and should be set as 1 if " + "Input(Weight) is the weight of conv layer, " + "default 0.") + .SetDefault(0); + AddAttr("power_iters", + "number of power iterations to calculate " + "spectral norm, default 1.") + .SetDefault(1); + AddAttr("eps", + "epsilon for numerical stability in " + "calculating norms") + .SetDefault(1e-12); + + AddComment(R"DOC( + This layer calculates the spectral normalization value of weight of + fc, conv1d, conv2d, conv3d layers which should be 2-D, 3-D, 4-D, 5-D + tensor. + + Spectral normalization stabilizes the training of critic in GANs + (Generative Adversarial Networks). This layer rescaling weight tensor + with spectral normalize value. + + For spectral normalization calculations, we rescaling weight + tensor with :math:`\sigma`, while :math:`\sigma{\mathbf{W}}` is + + $$\sigma(\mathbf{W}) = \max_{\mathbf{h}: \mathbf{h} \ne 0} \\frac{\|\mathbf{W} \mathbf{h}\|_2}{\|\mathbf{h}\|_2}$$ + + We calculate :math:`\sigma{\mathbf{W}}` through power iterations as + + $$ + \mathbf{v} = \mathbf{W}^{T} \mathbf{u} + $$ + $$ + \mathbf{v} = \\frac{\mathbf{v}}{\|\mathbf{v}\|_2} + $$ + $$ + \mathbf{u} = \mathbf{W}^{T} \mathbf{v} + $$ + $$ + \mathbf{u} = \\frac{\mathbf{u}}{\|\mathbf{u}\|_2} + $$ + + And :math:`\sigma` should be + + $$\sigma{\mathbf{W}} = \mathbf{u}^{T} \mathbf{W} \mathbf{v}$$ + + For details of spectral normalization, please refer to paper: + `Spectral Normalization `_ . + )DOC"); + } +}; + +class SpectralNormOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Weight"), "Input(Weight) should not be null"); + PADDLE_ENFORCE(ctx->HasInput("U"), "Input(U) should not be null"); + PADDLE_ENFORCE(ctx->HasInput("V"), "Input(V) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + auto dim_x = ctx->GetInputDim("Weight"); + if (ctx->HasOutput(framework::GradVarName("Weight"))) { + ctx->SetOutputDim(framework::GradVarName("Weight"), dim_x); + } + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("Weight")->type(), + ctx.GetPlace()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(spectral_norm, ops::SpectralNormOp, ops::SpectralNormOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(spectral_norm_grad, ops::SpectralNormOpGrad); +REGISTER_OP_CPU_KERNEL( + spectral_norm, + ops::SpectralNormKernel, + ops::SpectralNormKernel); +REGISTER_OP_CPU_KERNEL( + spectral_norm_grad, + ops::SpectralNormGradKernel, + ops::SpectralNormGradKernel); diff --git a/paddle/fluid/operators/spectral_norm_op.cu b/paddle/fluid/operators/spectral_norm_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..ea90e3b4c122b00d5bfe13617e48a9bbe0ee8395 --- /dev/null +++ b/paddle/fluid/operators/spectral_norm_op.cu @@ -0,0 +1,22 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/fluid/operators/spectral_norm_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + spectral_norm, + ops::SpectralNormKernel, + ops::SpectralNormKernel); +REGISTER_OP_CUDA_KERNEL( + spectral_norm_grad, + ops::SpectralNormGradKernel, + ops::SpectralNormGradKernel); diff --git a/paddle/fluid/operators/spectral_norm_op.h b/paddle/fluid/operators/spectral_norm_op.h new file mode 100644 index 0000000000000000000000000000000000000000..eb48e3b7840e18efe809540dd697f243a0a63a52 --- /dev/null +++ b/paddle/fluid/operators/spectral_norm_op.h @@ -0,0 +1,273 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +using EigenTensor = framework::EigenTensor; +using Tensor = framework::Tensor; + +using Array1 = Eigen::DSizes; +using Array2 = Eigen::DSizes; +using IndexPair = Eigen::IndexPair; + +template +static inline void TransCompute(const int rank, const Tensor& in, Tensor* out, + const std::vector& perm, + const DeviceContext& dev_ctx) { + if (rank <= 1 || rank > 5) { + PADDLE_THROW("Invalid weight rank."); + } + + switch (rank) { + case 2: + math::Transpose trans2; + trans2(dev_ctx, in, out, perm); + break; + case 3: + math::Transpose trans3; + trans3(dev_ctx, in, out, perm); + break; + case 4: + math::Transpose trans4; + trans4(dev_ctx, in, out, perm); + break; + case 5: + math::Transpose trans5; + trans5(dev_ctx, in, out, perm); + break; + default: + break; + } +} + +template +static inline void CalcMatrixSigmaAndNormWeight( + Tensor* sigma, Tensor* u, Tensor* v, Tensor* weight, const int power_iters, + const float eps, const framework::ExecutionContext& ctx) { + auto& place = *ctx.template device_context().eigen_device(); + auto blas = math::GetBlas(ctx); + auto sigma_t = EigenTensor::From(*sigma); + auto weight_t = EigenTensor::From(*weight); + auto u_t = EigenTensor::From(*u); + auto v_t = EigenTensor::From(*v); + + const int h = weight->dims()[0]; + const int w = weight->dims()[1]; + + for (int i = 0; i < power_iters; i++) { + // V = W^T * U / ||W^T * U||_2 + blas.MatMul(*weight, true, *u, false, T(1), v, T(0)); + auto v_t_norm = + v_t.square().sum().sqrt().eval().reshape(Array1(1)).broadcast( + Array1(w)); + v_t.device(place) = v_t / (v_t_norm + v_t_norm.constant(eps)); + // U = W^T * V / ||W^T * V||_2 + blas.MatMul(*weight, false, *v, false, T(1), u, T(0)); + auto u_t_norm = + u_t.square().sum().sqrt().eval().reshape(Array1(1)).broadcast( + Array1(h)); + u_t.device(place) = u_t / (u_t_norm + u_t_norm.constant(eps)); + } + Tensor weight_v; + weight_v.mutable_data({h, 1}, ctx.GetPlace()); + blas.MatMul(*weight, false, *v, false, T(1), &weight_v, T(0)); + auto weight_v_t = EigenTensor::From(weight_v); + sigma_t.device(place) = (u_t * weight_v_t) + .sum() + .eval() + .reshape(Array2(1, 1)) + .broadcast(Array2(h, w)); + weight_t.device(place) = weight_t / sigma_t; +} + +template +class SpectralNormKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + auto weight = ctx.Input("Weight"); + auto u = ctx.Input("U"); + auto v = ctx.Input("V"); + auto out = ctx.Output("Out"); + + int dim = ctx.Attr("dim"); + int power_iters = ctx.Attr("power_iters"); + float eps = ctx.Attr("eps"); + + const int h = u->dims()[0]; + const int w = v->dims()[0]; + + Tensor weight_mat; + auto dims = weight->dims(); + const int rank = dims.size(); + std::vector real_dims; + if (dim != 0) { + std::vector perm; + perm.push_back(dim); + real_dims.push_back(dims[dim]); + for (int i = 0; i < rank; i++) { + if (i != dim) { + perm.push_back(i); + real_dims.push_back(dims[i]); + } + } + weight_mat.mutable_data(framework::make_ddim(real_dims), + ctx.GetPlace()); + TransCompute(rank, *weight, &weight_mat, perm, dev_ctx); + } else { + for (int i = 0; i < rank; i++) { + real_dims.push_back(i); + } + TensorCopySync(*weight, ctx.GetPlace(), &weight_mat); + } + weight_mat = weight_mat.Resize({h, w}); + + Tensor sigma; + sigma.mutable_data(weight_mat.dims(), ctx.GetPlace()); + Tensor uu, vv; + TensorCopySync(*u, ctx.GetPlace(), &uu); + TensorCopySync(*v, ctx.GetPlace(), &vv); + CalcMatrixSigmaAndNormWeight( + &sigma, &(uu.Resize({h, 1})), &(vv.Resize({w, 1})), &weight_mat, + power_iters, eps, ctx); + + if (dim != 0) { + std::vector perm; + for (int i = 0; i < rank; i++) { + if (i < dim) { + perm.push_back(i + 1); + } else if (i == dim) { + perm.push_back(0); + } else { + perm.push_back(i); + } + } + out->mutable_data(dims, ctx.GetPlace()); + TransCompute( + rank, weight_mat.Resize(framework::make_ddim(real_dims)), out, perm, + dev_ctx); + } else { + TensorCopySync(weight_mat.Resize(dims), ctx.GetPlace(), out); + } + } +}; + +template +class SpectralNormGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& place = *ctx.template device_context().eigen_device(); + auto& dev_ctx = ctx.template device_context(); + auto blas = math::GetBlas(ctx); + auto weight = ctx.Input("Weight"); + auto u = ctx.Input("U"); + auto v = ctx.Input("V"); + auto out_grad = ctx.Input(framework::GradVarName("Out")); + auto weight_grad = ctx.Output(framework::GradVarName("Weight")); + + int dim = ctx.Attr("dim"); + int power_iters = ctx.Attr("power_iters"); + float eps = ctx.Attr("eps"); + + const int h = u->dims()[0]; + const int w = v->dims()[0]; + + Tensor weight_mat, out_grad_mat; + auto dims = weight->dims(); + const int rank = dims.size(); + std::vector real_dims; + if (dim != 0) { + std::vector perm; + perm.push_back(dim); + real_dims.push_back(dims[dim]); + for (int i = 0; i < rank; i++) { + if (i != dim) { + perm.push_back(i); + real_dims.push_back(dims[i]); + } + } + weight_mat.mutable_data(framework::make_ddim(real_dims), + ctx.GetPlace()); + out_grad_mat.mutable_data(framework::make_ddim(real_dims), + ctx.GetPlace()); + TransCompute(rank, *weight, &weight_mat, perm, dev_ctx); + TransCompute(rank, *out_grad, &out_grad_mat, perm, + dev_ctx); + } else { + for (int i = 0; i < rank; i++) { + real_dims.push_back(i); + } + TensorCopySync(*weight, ctx.GetPlace(), &weight_mat); + TensorCopySync(*out_grad, ctx.GetPlace(), &out_grad_mat); + } + weight_mat = weight_mat.Resize({h, w}); + out_grad_mat = out_grad_mat.Resize({h, w}); + + Tensor sigma; + sigma.mutable_data(weight_mat.dims(), ctx.GetPlace()); + Tensor uu, vv; + TensorCopySync(*u, ctx.GetPlace(), &uu); + TensorCopySync(*v, ctx.GetPlace(), &vv); + CalcMatrixSigmaAndNormWeight( + &sigma, &(uu.Resize({h, 1})), &(vv.Resize({w, 1})), &weight_mat, + power_iters, eps, ctx); + + Tensor uv; + uv.mutable_data({h, w}, ctx.GetPlace()); + blas.MatMul(uu.Resize({h, 1}), false, vv.Resize({w, 1}), false, T(1), &uv, + T(0)); + + Tensor weight_grad_mat; + weight_grad_mat.mutable_data({h, w}, ctx.GetPlace()); + auto weight_grad_mat_t = EigenTensor::From(weight_grad_mat); + auto weight_mat_t = EigenTensor::From(weight_mat); + auto out_grad_mat_t = EigenTensor::From(out_grad_mat); + auto sigma_t = EigenTensor::From(sigma); + auto uv_t = EigenTensor::From(uv); + weight_mat_t.device(place) = + weight_mat_t.sum().eval().reshape(Array2(1, 1)).broadcast(Array2(h, w)); + weight_grad_mat_t.device(place) = + out_grad_mat_t * (out_grad_mat_t.constant(1.0) - uv_t * weight_mat_t) / + sigma_t; + + if (dim != 0) { + std::vector perm; + for (int i = 0; i < rank; i++) { + if (i < dim) { + perm.push_back(i + 1); + } else if (i == dim) { + perm.push_back(0); + } else { + perm.push_back(i); + } + } + weight_grad->mutable_data(dims, ctx.GetPlace()); + TransCompute( + rank, weight_grad_mat.Resize(framework::make_ddim(real_dims)), + weight_grad, perm, dev_ctx); + } else { + TensorCopySync(weight_grad_mat.Resize(dims), ctx.GetPlace(), weight_grad); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc index c8ee13875c5ae772de3c09f97fded8f70c5698e6..640644a94690d9682a5e6b1aa788a9ebdc5d2a54 100644 --- a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc +++ b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc @@ -117,11 +117,11 @@ class TeacherStudentSigmoidLossOpMaker "[N x 1]. The teacher student sigmoid loss."); AddAttr( "soft_max_up_bound", - "fp32, if input > soft_max_up_bound, will be bound, default 15.0") + "fp32, if input > soft_max_up_bound, input will be bound, default 15.0") .SetDefault(15.0); - AddAttr( - "soft_max_lower_bound", - "fp32, if input < soft_max_lower_bound, will be bound, default -15.0") + AddAttr("soft_max_lower_bound", + "fp32, if input < soft_max_lower_bound, input will be " + "bound, default -15.0") .SetDefault(-15.0); AddComment(R"DOC( TeacherStudentSigmoidLoss Operator. @@ -134,7 +134,7 @@ we add another label(z') to original. label = {-2, -1, [0, 2]} when z' is not exist, clk = 0 : label = -2; when z' is not exist, clk = 1 : label = -1; - when z' is exist , clk = 0 : label = 0 + z'; + when z' is exist , clk = 0 : label = 0 + z'; when z' is exist , clk = 1 : label = 1 + z'; )DOC"); diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc index 031335009b692f9d1f73070c88e8e79d852cbe36..a8c86de9f9a1aea9ecdedd750757ec7d25cdf2f3 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc @@ -30,6 +30,9 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Ys", "A list of outputs").AsDuplicable(); AddAttr("subgraph", "the subgraph."); AddAttr("calibration_data", "the calibration data for int8"); + AddAttr( + "engine_serialized_data", + "the serialized data contains the all info of the ICUDAEngine"); AddAttr( "engine_key", "The engine_key here is used to distinguish different TRT Engines"); diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 2ff35c7c6ac6409d529de5b794bfc322b1f5dd9b..c36673312489738ad0475a0b70a23a1c6c948b9d 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -16,8 +16,10 @@ #ifdef PADDLE_WITH_CUDA +#include #include #include +#include #include #include "paddle/fluid/framework/executor.h" @@ -31,37 +33,6 @@ namespace paddle { namespace operators { -using FluidDT = framework::proto::VarType_Type; -using TRT_DT = nvinfer1::DataType; - -namespace { // NOLINT - -TRT_DT FluidDataType2TRT(FluidDT type) { - switch (type) { - case FluidDT::VarType_Type_FP32: - return TRT_DT::kFLOAT; - case FluidDT::VarType_Type_INT32: - return TRT_DT::kINT32; - default: - return TRT_DT::kINT32; - } - PADDLE_THROW("unkown type"); - return TRT_DT::kINT32; -} - -nvinfer1::Dims Vec2TRT_Dims(const std::vector &shape) { - PADDLE_ENFORCE_GT(shape.size(), 1UL, - "TensorRT' tensor input requires at least 2 dimensions"); - PADDLE_ENFORCE_LE(shape.size(), 4UL, - "TensorRT' tensor input requires at most 4 dimensions"); - PADDLE_ENFORCE(shape.size() == 4UL || shape.size() == 2UL); - if (shape.size() == 4UL) - return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]); - return nvinfer1::DimsCHW(shape[1], 1, 1); -} - -} // namespace // NOLINT - using inference::Singleton; using inference::tensorrt::TensorRTEngine; using inference::tensorrt::TRTInt8Calibrator; @@ -79,6 +50,7 @@ class TensorRTEngineOp : public framework::OperatorBase { bool enable_int8_; std::string calibration_data_; std::string engine_key_; + std::string engine_serialized_data_; bool calibration_mode_; public: @@ -93,6 +65,7 @@ class TensorRTEngineOp : public framework::OperatorBase { enable_int8_ = Attr("enable_int8"); calibration_data_ = Attr("calibration_data"); engine_key_ = Attr("engine_key"); + engine_serialized_data_ = Attr("engine_serialized_data"); auto params = Attr>("parameters"); for (const auto ¶m : params) { @@ -125,7 +98,8 @@ class TensorRTEngineOp : public framework::OperatorBase { RunCalibration(scope, dev_place); return; } - RunTrt(scope, dev_place); + auto *trt_engine = GetEngine(scope, dev_place); + RunTrt(scope, dev_place, trt_engine); } void RunCalibration(const framework::Scope &scope, @@ -136,10 +110,6 @@ class TensorRTEngineOp : public framework::OperatorBase { LOG_FIRST_N(INFO, 1) << "The TRT engine: " << engine_key_ << " is running calibration trt int8... "; int runtime_batch = 1; - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(dev_place); - auto stream = - reinterpret_cast(dev_ctx).stream(); if (!Singleton::Global().Has(engine_key_)) { TRTCalibratorEngine *calib_res = Singleton::Global().Create(engine_key_); @@ -156,11 +126,11 @@ class TensorRTEngineOp : public framework::OperatorBase { calib_buffers, runtime_batch, engine_key_, dev_place)); calib_res->thr_.reset(new std::thread([&]() { calib_res->engine_.reset(new TensorRTEngine( - max_batch_size_, workspace_size_, stream, - boost::get(dev_place).device, enable_int8_, - calib_res->calib_.get())); + max_batch_size_, workspace_size_, enable_int8_, + calib_res->calib_.get(), + boost::get(dev_place).device)); VLOG(3) << "start the calib trt engine thread"; - Prepare(scope, dev_place, calib_res->engine_.get()); + PrepareTRTEngine(scope, calib_res->engine_.get()); })); } @@ -180,28 +150,29 @@ class TensorRTEngineOp : public framework::OperatorBase { RunNativeImpl(scope, dev_place); } - void RunTrt(const framework::Scope &scope, - const platform::Place &dev_place) const { + void RunTrt(const framework::Scope &scope, const platform::Place &dev_place, + TensorRTEngine *engine) const { int runtime_batch = 1; platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(dev_place); auto stream = reinterpret_cast(dev_ctx).stream(); - if (trt_engine_.get() == nullptr) { - trt_engine_.reset( - new TensorRTEngine(max_batch_size_, workspace_size_, stream, - boost::get(dev_place).device, - enable_int8_, calibrator_.get())); - Prepare(scope, dev_place, trt_engine_.get()); - } - auto *engine = trt_engine_.get(); PADDLE_ENFORCE(!input_names_.empty(), "should pass more than one inputs"); std::vector output_maps = Attr>("output_name_mapping"); - // Convert input tensor from fluid to engine. + int num_inputs = 0; + + for (const auto &x : Inputs("Xs")) { + if (param_names_.count(x)) continue; + num_inputs += 1; + } + const int num_bindings = num_inputs + Outputs("Ys").size(); + std::vector buffers(num_bindings); + + // Bind input tensor to TRT. for (const auto &x : Inputs("Xs")) { if (param_names_.count(x)) continue; // convert input and copy to TRT engine's buffer @@ -209,28 +180,20 @@ class TensorRTEngineOp : public framework::OperatorBase { inference::analysis::GetFromScope(scope, x); auto t_shape = framework::vectorize(t.dims()); runtime_batch = t_shape[0]; - if (platform::is_cpu_place(t.place())) { - engine->SetInputFromCPU(x, static_cast(t.data()), - t.memory_size()); - } else { - engine->SetInputFromGPU(x, static_cast(t.data()), - t.memory_size()); - } - } - cudaStreamSynchronize(stream); - PADDLE_ENFORCE_LE(runtime_batch, max_batch_size_); - // Execute the engine. - engine->Execute(runtime_batch); + const int bind_index = engine->engine()->getBindingIndex(x.c_str()); + PADDLE_ENFORCE(bind_index < num_bindings, + "The bind index should be less than num_bindings"); + buffers[bind_index] = static_cast(t.data()); + } - // Convert output tensor from engine to fluid + // Bind output tensor to TRT. int output_index = 0; VLOG(4) << "TensorRT Engine Op Outputs:"; for (const auto &y : Outputs("Ys")) { - VLOG(4) << y; - // convert output and copy to fluid. - nvinfer1::ITensor *trt_t = engine->GetITensor(output_maps[output_index]); - auto dims = trt_t->getDimensions(); + const int bind_index = + engine->engine()->getBindingIndex(output_maps[output_index].c_str()); + auto dims = engine->engine()->getBindingDimensions(bind_index); // Use the output ITensor's dims to reshape the Fluid Tensor. // The ITensor doesn't contain the batch size dim. std::vector ddim; @@ -238,71 +201,55 @@ class TensorRTEngineOp : public framework::OperatorBase { for (int i = 0; i < dims.nbDims; i++) { ddim.push_back(dims.d[i]); } - auto *fluid_v = scope.FindVar(y); PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y); auto *fluid_t = fluid_v->GetMutable(); - fluid_t->Resize(framework::make_ddim(ddim)); - // TODO(Superjomn) change this float to dtype size. - auto size = - inference::analysis::AccuDims(dims.d, dims.nbDims) * runtime_batch; - engine->GetOutputInGPU( - output_maps[output_index], - fluid_t->mutable_data(platform::CUDAPlace( - boost::get(dev_place).device)), - size * sizeof(float)); + PADDLE_ENFORCE(bind_index < num_bindings, + "The bind index should be less than num_bindings"); + buffers[bind_index] = static_cast(fluid_t->mutable_data( + boost::get(dev_place))); + output_index += 1; } + PADDLE_ENFORCE_LE(runtime_batch, max_batch_size_); + // Execute the engine. + engine->Execute(runtime_batch, &buffers, stream); cudaStreamSynchronize(stream); } - void Prepare(const framework::Scope &scope, const platform::Place &dev_place, - TensorRTEngine *engine) const { + TensorRTEngine *GetEngine(const framework::Scope &scope, + const platform::Place &dev_place) const { + if (!trt_engine_) { + trt_engine_.reset(new inference::tensorrt::TensorRTEngine( + max_batch_size_, workspace_size_, enable_int8_, calibrator_.get(), + boost::get(dev_place).device)); + if (!engine_serialized_data_.empty()) { + trt_engine_->Deserialize(engine_serialized_data_); + } else { + PrepareTRTEngine(scope, trt_engine_.get()); + } + } + return trt_engine_.get(); + } + + void PrepareTRTEngine(const framework::Scope &scope, + TensorRTEngine *engine) const { LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP " "kernel etc). This process may cost a lot of time."; - framework::proto::BlockDesc block_desc; - block_desc.ParseFromString(Attr("subgraph")); + framework::proto::BlockDesc block_proto; + block_proto.ParseFromString(Attr("subgraph")); + framework::BlockDesc block_desc(nullptr, &block_proto); - std::vector output_maps = + std::vector inputs = Inputs("Xs"); + std::vector outputs = Attr>("output_name_mapping"); - engine->InitNetwork(); - - framework::BlockDesc block(nullptr /*programdesc*/, &block_desc); - VLOG(4) << "parsed var size " << block.AllVars().size(); - // Add inputs - VLOG(4) << "declare inputs"; - for (auto &input : Inputs("Xs")) { - if (param_names_.count(input)) continue; - VLOG(4) << "declare input " << input; - - auto &t = - inference::analysis::GetFromScope(scope, input); - auto t_shape = framework::vectorize(t.dims()); - - auto *var = block.FindVar(input); - // TensorRT engine need to create parameters. The parameter's description - // should be set in - PADDLE_ENFORCE(var, "no variable called %s", input); - PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR, - "TensorRT engine only takes LoDTensor as input"); - - engine->DeclareInput( - input, FluidDataType2TRT( - var->Proto()->type().lod_tensor().tensor().data_type()), - Vec2TRT_Dims(t_shape)); - } inference::Singleton::Global() - .ConvertBlock(block_desc, param_names_, scope, engine); - - // Add outputs - for (auto &output : output_maps) { - engine->DeclareOutput(output); - } - engine->FreezeNetwork(); + .ConvertBlockToTRTEngine(&block_desc, scope, inputs, param_names_, + outputs, engine); } }; diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index 5a3d9d2c1a3e8111acbad2ddcf4f5469a3a99751..e7ad2f4fe0c654d8928f5793c1ad8052ab766fb5 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -107,6 +107,7 @@ TEST(TensorRTEngineOp, manual) { engine_op_desc.SetAttr("output_name_mapping", std::vector({"z0"})); engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); + engine_op_desc.SetAttr("engine_serialized_data", std::string("")); LOG(INFO) << "create engine op"; auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); @@ -202,6 +203,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { engine_op_desc.SetAttr("output_name_mapping", std::vector({"z3"})); engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); + engine_op_desc.SetAttr("engine_serialized_data", std::string("")); auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index fbb2ac3fe8c5de9b0be593df225677c6a7a89e9c..9220d35707b286d76ab4824e3f1080453f60bfe6 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -36,7 +36,7 @@ cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info) nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce) -cc_library(place SRCS place.cc DEPS enforce boost lib_any) +cc_library(place SRCS place.cc DEPS enforce boost) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) add_subdirectory(dynload) @@ -82,13 +82,18 @@ nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_ cc_test(init_test SRCS init_test.cc DEPS device_context) nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) +nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda) nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context) cc_library(timer SRCS timer.cc) cc_test(timer_test SRCS timer_test.cc DEPS timer) cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) -cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer) +if(WITH_GPU) + nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce) +else() + cc_library(profiler SRCS profiler.cc DEPS device_tracer enforce) +endif() cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor) diff --git a/paddle/fluid/platform/cudnn_desc.h b/paddle/fluid/platform/cudnn_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..1062b403f289610a6dec28dead9177d387f0d4e0 --- /dev/null +++ b/paddle/fluid/platform/cudnn_desc.h @@ -0,0 +1,124 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace platform { +using framework::Tensor; + +template +cudnnDataType_t ToCudnnDataType(const T& t) { + auto type = framework::ToDataType(t); + return ToCudnnDataType(type); +} + +template <> +cudnnDataType_t ToCudnnDataType(const framework::proto::VarType::Type& t) { + cudnnDataType_t type = CUDNN_DATA_FLOAT; + switch (t) { + case framework::proto::VarType::FP16: + type = CUDNN_DATA_HALF; + break; + case framework::proto::VarType::FP32: + type = CUDNN_DATA_FLOAT; + break; + case framework::proto::VarType::FP64: + type = CUDNN_DATA_DOUBLE; + break; + default: + break; + } + return type; +} + +class ActivationDescriptor { + public: + using T = cudnnActivationStruct; + struct Deleter { + void operator()(T* t) { + if (t != nullptr) { + PADDLE_ENFORCE(dynload::cudnnDestroyActivationDescriptor(t)); + t = nullptr; + } + } + }; + ActivationDescriptor() { + T* raw_ptr; + PADDLE_ENFORCE(dynload::cudnnCreateActivationDescriptor(&raw_ptr)); + desc_.reset(raw_ptr); + } + template + void set(cudnnActivationMode_t mode, const T& coef) { + CUDNN_ENFORCE(dynload::cudnnSetActivationDescriptor( + desc_.get(), mode, CUDNN_NOT_PROPAGATE_NAN, static_cast(coef))); + } + + T* desc() { return desc_.get(); } + T* desc() const { return desc_.get(); } + + private: + std::unique_ptr desc_; +}; + +class TensorDescriptor { + public: + using T = cudnnTensorStruct; + struct Deleter { + void operator()(T* t) { + if (t != nullptr) { + PADDLE_ENFORCE(dynload::cudnnDestroyTensorDescriptor(t)); + t = nullptr; + } + } + }; + TensorDescriptor() { + T* raw_ptr; + PADDLE_ENFORCE(dynload::cudnnCreateTensorDescriptor(&raw_ptr)); + desc_.reset(raw_ptr); + } + T* desc() { return desc_.get(); } + T* desc() const { return desc_.get(); } + void set(const Tensor& tensor, const int groups = 1) { + auto dims = framework::vectorize2int(tensor.dims()); + std::vector strides(dims.size()); + strides[dims.size() - 1] = 1; + for (int i = dims.size() - 2; i >= 0; i--) { + strides[i] = dims[i + 1] * strides[i + 1]; + } + std::vector dims_with_group(dims.begin(), dims.end()); + if (groups > 1) { + dims_with_group[1] = dims_with_group[1] / groups; + } + PADDLE_ENFORCE(dynload::cudnnSetTensorNdDescriptor( + desc_.get(), ToCudnnDataType(tensor.type()), dims_with_group.size(), + dims_with_group.data(), strides.data())); + } + + private: + std::unique_ptr desc_; +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/cudnn_desc_test.cc b/paddle/fluid/platform/cudnn_desc_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..a60102a54899b25c89d8c131220dde21f77bba70 --- /dev/null +++ b/paddle/fluid/platform/cudnn_desc_test.cc @@ -0,0 +1,41 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/cudnn_desc.h" +#include + +namespace paddle { +namespace platform { + +TEST(TensorDescriptor, Empty) { + ActivationDescriptor a; + TensorDescriptor t; + TensorDescriptor t1; + TensorDescriptor *t11 = new TensorDescriptor(); + delete t11; + std::unique_ptr tt(new TensorDescriptor()); +} + +TEST(TensorDescriptor, Normal) { + framework::Tensor tt; + tt.Resize({2, 3, 4}); + tt.mutable_data(platform::CPUPlace()); + + TensorDescriptor desc; + desc.set(tt); + EXPECT_TRUE(desc.desc() != nullptr); +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 2493fb71c019f9923012afa4a46cb3e95479f860..920b43b2b1990af58b73888bf7a652d57c20563c 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -291,7 +291,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) if (dynload::HasCUDNN()) { auto local_cudnn_version = cudnn_dso_ver / 100; auto compile_cudnn_version = CUDNN_VERSION / 100; - if (local_cudnn_version < compile_cudnn_version) { + if (local_cudnn_version < static_cast(compile_cudnn_version)) { LOG_FIRST_N(WARNING, 1) << "WARNING: device: " << place_.device << ". The installed Paddle is compiled with CUDNN " @@ -394,7 +394,7 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name, int tid = platform::get_cur_thread_id(); - std::lock_guard lock(*p_mutex_.get()); + std::lock_guard lock(*p_mutex_); // Find KeyBlob for current thread auto map_it = pMap->find(tid); @@ -427,7 +427,7 @@ std::shared_ptr MKLDNNDeviceContext::GetBlob( int tid = platform::get_cur_thread_id(); - std::lock_guard lock(*p_mutex_.get()); + std::lock_guard lock(*p_mutex_); // Find KeyBlob for current thread firstly auto map_it = pMap->find(tid); diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index 0a4563ead65b1e45adca1d1a1fce066a1a55d932..8458b17f82a976bad37df58dddc2c0d80c8eb13e 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -11,20 +11,26 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/platform/device_tracer.h" #include +#include #include +#include #include #include // NOLINT #include +#include #include #include // NOLINT +#include +#include #include #include "glog/logging.h" #include "google/protobuf/text_format.h" #include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/platform/device_tracer.h" +#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/printf.h" namespace paddle { @@ -33,17 +39,31 @@ namespace { // Tracking the nested block stacks of each thread. thread_local std::deque block_id_stack; // Tracking the nested event stacks. -thread_local std::deque annotation_stack; +thread_local std::deque annotation_stack; + +std::map system_thread_id_map; std::once_flag tracer_once_flag; DeviceTracer *tracer = nullptr; + +void PrintCuptiHint() { + static bool showed = false; + if (showed) return; + showed = true; + LOG(WARNING) << "Invalid timestamp occured. Please try increasing the " + "FLAGS_multiple_of_cupti_buffer_size."; +} + } // namespace #ifdef PADDLE_WITH_CUPTI namespace { -// TODO(panyx0718): Revisit the buffer size here. -uint64_t kBufSize = 32 * 1024; +// The experimental best performance is +// the same size with CUPTI device buffer size(8M) +uint64_t kBufSize = 1024 * 1024 * 8; uint64_t kAlignSize = 8; +std::unordered_map runtime_cbid_str, + driver_cbid_str; #define ALIGN_BUFFER(buffer, align) \ (((uintptr_t)(buffer) & ((align)-1)) \ @@ -92,15 +112,33 @@ std::string MemcpyKind(CUpti_ActivityMemcpyKind kind) { return "MEMCPY"; } +std::string DriverKind(CUpti_CallbackId cbid) { + auto iter = driver_cbid_str.find(cbid); + if (iter == driver_cbid_str.end()) + return "Driver API " + std::to_string(cbid); + return iter->second; +} + +std::string RuntimeKind(CUpti_CallbackId cbid) { + auto iter = runtime_cbid_str.find(cbid); + if (iter == runtime_cbid_str.end()) + return "Runtime API " + std::to_string(cbid); + return iter->second; +} + void EnableActivity() { // Device activity record is created when CUDA initializes, so we // want to enable it before cuInit() or any CUDA runtime call. CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY)); - CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL)); - CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE)); - CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET)); - CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD)); + CUPTI_CALL( + dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL)); + // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL)); + CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER)); + CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME)); // We don't track these activities for now. + CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET)); + // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD)); + // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE)); // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT)); // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER)); // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME)); @@ -110,16 +148,17 @@ void EnableActivity() { void DisableActivity() { CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY)); - CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_KERNEL)); - CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DEVICE)); + CUPTI_CALL( + dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL)); + // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DEVICE)); // Disable all other activity record kinds. - CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONTEXT)); + // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONTEXT)); CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DRIVER)); CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_RUNTIME)); CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMSET)); - CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_NAME)); - CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MARKER)); - CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OVERHEAD)); + // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_NAME)); + // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MARKER)); + // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OVERHEAD)); } void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size, @@ -132,6 +171,11 @@ void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size, void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize) { + static std::thread::id cupti_thread_id(0); + if (cupti_thread_id == std::thread::id(0)) + cupti_thread_id = std::this_thread::get_id(); + PADDLE_ENFORCE_EQ(std::this_thread::get_id(), cupti_thread_id, + "Only one thread is allowed to call bufferCompleted()"); CUptiResult status; CUpti_Activity *record = NULL; if (validSize > 0) { @@ -168,6 +212,36 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, memcpy->correlationId, memcpy->bytes); break; } + case CUPTI_ACTIVITY_KIND_MEMSET: { + auto *memset = + reinterpret_cast(record); + tracer->AddKernelRecords("MEMSET", memset->start, memset->end, + memset->deviceId, memset->streamId, + memset->correlationId); + break; + } + case CUPTI_ACTIVITY_KIND_DRIVER: { + auto *api = reinterpret_cast(record); + if (api->start != 0 && api->end != 0) { + // -1 device id represents ActiveKind api call + tracer->AddActiveKindRecords( + DriverKind(api->cbid), api->start, api->end, -1, + GetThreadIdFromSystemThreadId(api->threadId), + api->correlationId); + } + break; + } + case CUPTI_ACTIVITY_KIND_RUNTIME: { + auto *api = reinterpret_cast(record); + if (api->start != 0 && api->end != 0) { + // -1 device id represents ActiveKind api call + tracer->AddActiveKindRecords( + RuntimeKind(api->cbid), api->start, api->end, -1, + GetThreadIdFromSystemThreadId(api->threadId), + api->correlationId); + } + break; + } default: { break; } } } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) { @@ -183,21 +257,35 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, dynload::cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped)); if (dropped != 0) { fprintf(stderr, "Dropped %u activity records\n", (unsigned int)dropped); + PrintCuptiHint(); } } free(buffer); } + +void initCuptiCbidStr(); + } // namespace #endif // PADDLE_WITH_CUPTI class DeviceTracerImpl : public DeviceTracer { public: - DeviceTracerImpl() : enabled_(false) {} + DeviceTracerImpl() : enabled_(false) { +#ifdef PADDLE_WITH_CUPTI + initCuptiCbidStr(); +#endif + } - void AddAnnotation(uint64_t id, const std::string &anno) { - std::lock_guard l(trace_mu_); - correlations_[id] = anno; + void AddAnnotation(uint32_t id, Event *event) { + thread_local std::forward_list> + *local_correlations_pairs = nullptr; + if (local_correlations_pairs == nullptr) { + std::lock_guard l(trace_mu_); + correlations_pairs.emplace_front(); + local_correlations_pairs = &correlations_pairs.front(); + } + local_correlations_pairs->push_front(std::make_pair(id, event)); } void AddCPURecords(const std::string &anno, uint64_t start_ns, @@ -206,8 +294,13 @@ class DeviceTracerImpl : public DeviceTracer { VLOG(1) << "Empty timeline annotation."; return; } - std::lock_guard l(trace_mu_); - cpu_records_.push_back( + thread_local std::forward_list *local_cpu_records_ = nullptr; + if (local_cpu_records_ == nullptr) { + std::lock_guard l(trace_mu_); + cpu_records_.emplace_front(); + local_cpu_records_ = &cpu_records_.front(); + } + local_cpu_records_->push_front( CPURecord{anno, start_ns, end_ns, device_id, thread_id}); } @@ -215,25 +308,64 @@ class DeviceTracerImpl : public DeviceTracer { uint64_t end_ns, int64_t device_id, int64_t stream_id, uint32_t correlation_id, uint64_t bytes) { // 0 means timestamp information could not be collected for the kernel. - if (start_ns == 0 || end_ns == 0) { + if (start_ns == 0 || end_ns == 0 || start_ns == end_ns) { VLOG(3) << name << " cannot be traced"; + PrintCuptiHint(); return; } - std::lock_guard l(trace_mu_); - mem_records_.push_back(MemRecord{name, start_ns, end_ns, device_id, - stream_id, correlation_id, bytes}); + // NOTE(liangdun): lock is not needed, only one thread call this function. + mem_records_.push_front(MemRecord{name, start_ns, end_ns, device_id, + stream_id, correlation_id, bytes}); + } + + void AddMemInfoRecord(uint64_t start_ns, uint64_t end_ns, size_t bytes, + const Place &place, const std::string &alloc_in, + const std::string &free_in, int64_t thread_id) { + if (0 == start_ns || 0 == end_ns) { + VLOG(3) << alloc_in << ", " << free_in << " Cannot be traced."; + return; + } + thread_local std::forward_list *local_mem_info_record = + nullptr; + if (local_mem_info_record == nullptr) { + std::lock_guard l(trace_mu_); + mem_info_record_.emplace_front(); + local_mem_info_record = &mem_info_record_.front(); + } + local_mem_info_record->emplace_front(MemInfoRecord{ + start_ns, end_ns, bytes, place, thread_id, alloc_in, free_in}); + } + + void AddActiveKindRecords(const std::string &anno, uint64_t start_ns, + uint64_t end_ns, int64_t device_id, + int64_t thread_id, uint32_t correlation_id) { + if (anno.empty()) { + VLOG(1) << "Empty timeline annotation."; + return; + } + thread_local std::forward_list + *local_active_kind_records = nullptr; + if (local_active_kind_records == nullptr) { + std::lock_guard l(trace_mu_); + active_kind_records_.emplace_front(); + local_active_kind_records = &active_kind_records_.front(); + } + // lock is not needed, only one thread call this function. + local_active_kind_records->push_front(ActiveKindRecord{ + anno, start_ns, end_ns, device_id, thread_id, correlation_id}); } void AddKernelRecords(std::string name, uint64_t start, uint64_t end, int64_t device_id, int64_t stream_id, uint32_t correlation_id) { // 0 means timestamp information could not be collected for the kernel. - if (start == 0 || end == 0) { + if (start == 0 || end == 0 || start == end) { VLOG(3) << correlation_id << " cannot be traced"; + PrintCuptiHint(); return; } - std::lock_guard l(trace_mu_); - kernel_records_.push_back( + // NOTE(liangdun): lock is not needed, only one thread call this function. + kernel_records_.push_front( KernelRecord{name, start, end, device_id, stream_id, correlation_id}); } @@ -263,25 +395,88 @@ class DeviceTracerImpl : public DeviceTracer { } else if (ret != CUPTI_SUCCESS) { fprintf(stderr, "Failed to create CUPTI subscriber.\n"); } - CUPTI_CALL( - dynload::cuptiEnableCallback(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API, - CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)); + const std::vector cbids { + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020, + CUPTI_RUNTIME_TRACE_CBID_cudaSetupArgument_v3020, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020, + CUPTI_RUNTIME_TRACE_CBID_cudaMemset_v3020, + CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_v3020, + CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020, + CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000 +#if CUDA_VERSION >= 9000 + , + CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernel_v9000, + CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernelMultiDevice_v9000 +#endif + }; + for (auto cbid : cbids) + CUPTI_CALL(dynload::cuptiEnableCallback( + 1, subscriber_, CUPTI_CB_DOMAIN_RUNTIME_API, cbid)); CUPTI_CALL(dynload::cuptiGetTimestamp(&start_ns_)); #endif // PADDLE_WITH_CUPTI enabled_ = true; } + void Reset() { +#ifdef PADDLE_WITH_CUPTI + CUPTI_CALL( + dynload::cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED)); +#endif + std::lock_guard l(trace_mu_); + kernel_records_.clear(); + mem_records_.clear(); + correlations_.clear(); + for (auto &tmp : correlations_pairs) tmp.clear(); + for (auto &tmp : cpu_records_) tmp.clear(); + for (auto &tmp : mem_info_record_) tmp.clear(); + for (auto &tmp : active_kind_records_) tmp.clear(); + } + + void GenEventKernelCudaElapsedTime() { +#ifdef PADDLE_WITH_CUPTI + if (correlations_.empty()) + for (auto &tmp : correlations_pairs) + for (auto &pair : tmp) correlations_[pair.first] = pair.second; + for (const KernelRecord &r : kernel_records_) { + auto c = correlations_.find(r.correlation_id); + if (c != correlations_.end() && c->second != nullptr) { + Event *e = c->second; + e->AddCudaElapsedTime(r.start_ns, r.end_ns); + } + } + for (const auto &r : mem_records_) { + auto c = correlations_.find(r.correlation_id); + if (c != correlations_.end() && c->second != nullptr) { + Event *e = c->second; + e->AddCudaElapsedTime(r.start_ns, r.end_ns); + } + } +#endif + } + proto::Profile GenProfile(const std::string &profile_path) { + int miss = 0, find = 0; std::lock_guard l(trace_mu_); proto::Profile profile_pb; profile_pb.set_start_ns(start_ns_); profile_pb.set_end_ns(end_ns_); + if (correlations_.empty()) { + for (auto &tmp : correlations_pairs) { + for (auto &pair : tmp) correlations_[pair.first] = pair.second; + } + } + for (const KernelRecord &r : kernel_records_) { auto *event = profile_pb.add_events(); event->set_type(proto::Event::GPUKernel); - if (correlations_.find(r.correlation_id) != correlations_.end()) { - event->set_name(correlations_.at(r.correlation_id)); + auto c = correlations_.find(r.correlation_id); + if (c != correlations_.end() && c->second != nullptr) { + event->set_name(c->second->name()); + event->set_detail_info(r.name); + find++; } else { + VLOG(10) << "Missing Kernel Event: " + r.name; + miss++; event->set_name(r.name); } event->set_start_ns(r.start_ns); @@ -289,31 +484,86 @@ class DeviceTracerImpl : public DeviceTracer { event->set_sub_device_id(r.stream_id); event->set_device_id(r.device_id); } + VLOG(1) << "KernelRecord event miss: " << miss << " find: " << find; - for (const CPURecord &r : cpu_records_) { - auto *event = profile_pb.add_events(); - event->set_type(proto::Event::CPU); - event->set_name(r.name); - event->set_start_ns(r.start_ns); - event->set_end_ns(r.end_ns); - event->set_sub_device_id(r.thread_id); - event->set_device_id(r.device_id); + for (auto &tmp : cpu_records_) { + for (const CPURecord &r : tmp) { + auto *event = profile_pb.add_events(); + event->set_type(proto::Event::CPU); + event->set_name(r.name); + event->set_start_ns(r.start_ns); + event->set_end_ns(r.end_ns); + event->set_sub_device_id(r.thread_id); + event->set_device_id(r.device_id); + } } + + for (auto &tmp : active_kind_records_) { + for (const ActiveKindRecord &r : tmp) { + auto *event = profile_pb.add_events(); + event->set_type(proto::Event::CPU); + auto c = correlations_.find(r.correlation_id); + if (c != correlations_.end() && c->second != nullptr) { + event->set_name(c->second->name()); + event->set_detail_info(r.name); + } else { + event->set_name(r.name); + } + event->set_start_ns(r.start_ns); + event->set_end_ns(r.end_ns); + event->set_sub_device_id(r.thread_id); + event->set_device_id(r.device_id); + } + } + miss = find = 0; for (const MemRecord &r : mem_records_) { auto *event = profile_pb.add_events(); event->set_type(proto::Event::GPUKernel); - event->set_name(r.name); + auto c = correlations_.find(r.correlation_id); + if (c != correlations_.end() && c->second != nullptr) { + event->set_name(c->second->name()); + event->set_detail_info(r.name); + find++; + } else { + miss++; + event->set_name(r.name); + } event->set_start_ns(r.start_ns); event->set_end_ns(r.end_ns); event->set_sub_device_id(r.stream_id); event->set_device_id(r.device_id); event->mutable_memcopy()->set_bytes(r.bytes); } + VLOG(1) << "MemRecord event miss: " << miss << " find: " << find; + + for (auto &tmp : mem_info_record_) { + for (const auto &r : tmp) { + auto *event = profile_pb.add_mem_events(); + event->set_device_id(0); + if (platform::is_cpu_place(r.place)) { + event->set_place(proto::MemEvent::CPUPlace); + } else if (platform::is_gpu_place(r.place)) { + event->set_place(proto::MemEvent::CUDAPlace); + event->set_device_id( + boost::get(r.place).GetDeviceId()); + } else if (platform::is_cuda_pinned_place(r.place)) { + event->set_place(proto::MemEvent::CUDAPinnedPlace); + } else { + PADDLE_THROW("The current place is not supported."); + } + event->set_alloc_in(r.alloc_in); + event->set_free_in(r.free_in); + event->set_start_ns(r.start_ns); + event->set_end_ns(r.end_ns); + event->set_bytes(r.bytes); + event->set_thread_id(r.thread_id); + } + } + std::ofstream profile_f; - profile_f.open(profile_path, std::ios::out | std::ios::trunc); - std::string profile_str; - profile_pb.SerializeToString(&profile_str); - profile_f << profile_str; + profile_f.open(profile_path, + std::ios::out | std::ios::trunc | std::ios::binary); + profile_pb.SerializeToOstream(&profile_f); profile_f.close(); return profile_pb; } @@ -321,12 +571,13 @@ class DeviceTracerImpl : public DeviceTracer { void Disable() { #ifdef PADDLE_WITH_CUPTI // flush might cause additional calls to DeviceTracker. - dynload::cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED); + CUPTI_CALL( + dynload::cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED)); #endif // PADDLE_WITH_CUPTI std::lock_guard l(trace_mu_); #ifdef PADDLE_WITH_CUPTI DisableActivity(); - dynload::cuptiUnsubscribe(subscriber_); + CUPTI_CALL(dynload::cuptiUnsubscribe(subscriber_)); CUPTI_CALL(dynload::cuptiGetTimestamp(&end_ns_)); #endif // PADDLE_WITH_CUPTI enabled_ = false; @@ -337,18 +588,10 @@ class DeviceTracerImpl : public DeviceTracer { static void CUPTIAPI ApiCallback(void *userdata, CUpti_CallbackDomain domain, CUpti_CallbackId cbid, const void *cbdata) { auto *cbInfo = reinterpret_cast(cbdata); - DeviceTracer *tracer = reinterpret_cast(userdata); - - if ((domain == CUPTI_CB_DOMAIN_DRIVER_API) && - (cbid == CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)) { - if (cbInfo->callbackSite == CUPTI_API_ENTER) { - const std::string anno = !annotation_stack.empty() - ? annotation_stack.back() - : cbInfo->symbolName; - tracer->AddAnnotation(cbInfo->correlationId, anno); - } - } else { - VLOG(1) << "Unhandled API Callback for " << domain << " " << cbid; + DeviceTracerImpl *tracer = reinterpret_cast(userdata); + if (cbInfo->callbackSite == CUPTI_API_ENTER) { + Event *event = CurAnnotation(); + tracer->AddAnnotation(cbInfo->correlationId, event); } } CUpti_SubscriberHandle subscriber_; @@ -357,10 +600,14 @@ class DeviceTracerImpl : public DeviceTracer { bool enabled_; uint64_t start_ns_; uint64_t end_ns_; - std::vector kernel_records_; - std::vector mem_records_; - std::vector cpu_records_; - std::unordered_map correlations_; + std::forward_list kernel_records_; + std::forward_list mem_records_; + std::forward_list> cpu_records_; + std::forward_list> mem_info_record_; + std::forward_list> active_kind_records_; + std::forward_list>> + correlations_pairs; + std::unordered_map correlations_; }; void CreateTracer(DeviceTracer **t) { *t = new DeviceTracerImpl(); } @@ -370,21 +617,107 @@ DeviceTracer *GetDeviceTracer() { return tracer; } -void SetCurAnnotation(const std::string &anno) { - annotation_stack.push_back(anno); -} +void SetCurAnnotation(Event *event) { annotation_stack.push_back(event); } void ClearCurAnnotation() { annotation_stack.pop_back(); } -std::string CurAnnotation() { - if (annotation_stack.empty()) return ""; +Event *CurAnnotation() { + if (annotation_stack.empty()) return nullptr; return annotation_stack.back(); } +std::string CurAnnotationName() { + if (annotation_stack.empty()) return "Unknown"; + return annotation_stack.back()->name(); +} void SetCurBlock(int block_id) { block_id_stack.push_back(block_id); } void ClearCurBlock() { block_id_stack.pop_back(); } int BlockDepth() { return block_id_stack.size(); } + +uint32_t GetCurSystemThreadId() { + std::stringstream ss; + ss << std::this_thread::get_id(); + uint32_t id = static_cast(std::stoull(ss.str())); + return id; +} + +void RecoreCurThreadId(int32_t id) { + auto gid = GetCurSystemThreadId(); + VLOG(1) << "RecoreCurThreadId: " << gid << " -> " << id; + system_thread_id_map[gid] = id; +} + +int32_t GetThreadIdFromSystemThreadId(uint32_t id) { + auto it = system_thread_id_map.find(id); + if (it != system_thread_id_map.end()) return it->second; + // return origin id if no event is recorded in this thread. + return static_cast(id); +} + +#ifdef PADDLE_WITH_CUPTI +namespace { + +void initCuptiCbidStr() { + static bool called = false; + if (called) return; + called = true; +#define REGISTER_RUNTIME_CBID_STR(cbid) \ + runtime_cbid_str[CUPTI_RUNTIME_TRACE_CBID_##cbid] = #cbid + + REGISTER_RUNTIME_CBID_STR(cudaBindTexture_v3020); + REGISTER_RUNTIME_CBID_STR(cudaConfigureCall_v3020); + REGISTER_RUNTIME_CBID_STR(cudaDeviceGetAttribute_v5000); + REGISTER_RUNTIME_CBID_STR(cudaDeviceGetStreamPriorityRange_v5050); + REGISTER_RUNTIME_CBID_STR(cudaDeviceSynchronize_v3020); + REGISTER_RUNTIME_CBID_STR(cudaDriverGetVersion_v3020); + REGISTER_RUNTIME_CBID_STR(cudaEventCreateWithFlags_v3020); + REGISTER_RUNTIME_CBID_STR(cudaEventDestroy_v3020); + REGISTER_RUNTIME_CBID_STR(cudaEventDestroy_v3020); + REGISTER_RUNTIME_CBID_STR(cudaEventQuery_v3020); + REGISTER_RUNTIME_CBID_STR(cudaEventRecord_v3020); + REGISTER_RUNTIME_CBID_STR(cudaFreeHost_v3020); + REGISTER_RUNTIME_CBID_STR(cudaFree_v3020); + REGISTER_RUNTIME_CBID_STR(cudaFuncGetAttributes_v3020); + REGISTER_RUNTIME_CBID_STR(cudaGetDeviceCount_v3020); + REGISTER_RUNTIME_CBID_STR(cudaGetDeviceProperties_v3020); + REGISTER_RUNTIME_CBID_STR(cudaGetDevice_v3020); + REGISTER_RUNTIME_CBID_STR(cudaGetErrorString_v3020); + REGISTER_RUNTIME_CBID_STR(cudaGetLastError_v3020); + REGISTER_RUNTIME_CBID_STR(cudaHostAlloc_v3020); + REGISTER_RUNTIME_CBID_STR(cudaHostGetDevicePointer_v3020); + REGISTER_RUNTIME_CBID_STR(cudaLaunchKernel_v7000); + REGISTER_RUNTIME_CBID_STR(cudaMallocHost_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMalloc_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpyAsync_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpy_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemsetAsync_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemset_v3020); + REGISTER_RUNTIME_CBID_STR( + cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000); + REGISTER_RUNTIME_CBID_STR(cudaPeekAtLastError_v3020); + REGISTER_RUNTIME_CBID_STR(cudaRuntimeGetVersion_v3020); + REGISTER_RUNTIME_CBID_STR(cudaSetDevice_v3020); + REGISTER_RUNTIME_CBID_STR(cudaStreamCreate_v3020); + REGISTER_RUNTIME_CBID_STR(cudaStreamCreateWithFlags_v5000); + REGISTER_RUNTIME_CBID_STR(cudaStreamCreateWithPriority_v5050); + REGISTER_RUNTIME_CBID_STR(cudaStreamDestroy_v5050); + REGISTER_RUNTIME_CBID_STR(cudaStreamSynchronize_v3020); + REGISTER_RUNTIME_CBID_STR(cudaStreamWaitEvent_v3020); + REGISTER_RUNTIME_CBID_STR(cudaUnbindTexture_v3020); + REGISTER_RUNTIME_CBID_STR(cudaSetupArgument_v3020); + REGISTER_RUNTIME_CBID_STR(cudaLaunch_v3020); + REGISTER_RUNTIME_CBID_STR(cudaDeviceGetPCIBusId_v4010); +#if CUDA_VERSION >= 9000 + REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernel_v9000); + REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernelMultiDevice_v9000); +#endif + +#undef REGISTER_RUNTIME_CBID_STR +} +} // namespace +#endif // PADDLE_WITH_CUPTI + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h index bf0786be2d0fafbf4b610d16ef587ac219399203..85168a046fb3fa4317956737871cde56e15bedfb 100644 --- a/paddle/fluid/platform/device_tracer.h +++ b/paddle/fluid/platform/device_tracer.h @@ -17,6 +17,8 @@ limitations under the License. */ #include #include "paddle/fluid/platform/dynload/cupti.h" +#include "paddle/fluid/platform/event.h" +#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/profiler.pb.h" @@ -46,6 +48,7 @@ class DeviceTracer { int64_t stream_id; uint32_t correlation_id; }; + struct CPURecord { std::string name; uint64_t start_ns; @@ -53,6 +56,7 @@ class DeviceTracer { int64_t device_id; int64_t thread_id; }; + struct MemRecord { std::string name; uint64_t start_ns; @@ -63,16 +67,37 @@ class DeviceTracer { uint64_t bytes; }; + struct MemInfoRecord { + uint64_t start_ns; + uint64_t end_ns; + size_t bytes; + Place place; + int64_t thread_id; + std::string alloc_in; + std::string free_in; + }; + + struct ActiveKindRecord { + std::string name; + uint64_t start_ns; + uint64_t end_ns; + int64_t device_id; + int64_t thread_id; + uint32_t correlation_id; + }; + virtual ~DeviceTracer() {} // Needs to be called once before use. virtual void Enable() = 0; // Needs to be called once after use. virtual void Disable() = 0; + // Needs to be called once before reuse. + virtual void Reset() = 0; // Add a pair to correlate internal cuda id with high level - // annotation (string). So cuda statistics can be represented by + // annotation event(with string). So cuda statistics can be represented by // human-readable annotations. - virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0; + virtual void AddAnnotation(uint32_t id, Event* event) = 0; virtual void AddMemRecords(const std::string& name, uint64_t start_ns, uint64_t end_ns, int64_t device_id, @@ -82,6 +107,16 @@ class DeviceTracer { virtual void AddCPURecords(const std::string& anno, uint64_t start_ns, uint64_t end_ns, int64_t device_id, int64_t thread_id) = 0; + virtual void AddActiveKindRecords(const std::string& anno, uint64_t start_ns, + uint64_t end_ns, int64_t device_id, + int64_t thread_id, + uint32_t correlation_id) = 0; + + virtual void AddMemInfoRecord(uint64_t start_ns, uint64_t end_ns, + size_t bytes, const Place& place, + const std::string& alloc_in, + const std::string& free_in, + int64_t thread_id) = 0; // Add a cuda kernel stats. `correlation_id` will be mapped to annotation // added before for human readability. @@ -92,6 +127,9 @@ class DeviceTracer { // Generate a proto after done (Disabled). virtual proto::Profile GenProfile(const std::string& profile_path) = 0; + // generate kernel elapsed time into Event + virtual void GenEventKernelCudaElapsedTime() = 0; + virtual bool IsEnabled() = 0; }; @@ -99,14 +137,19 @@ class DeviceTracer { DeviceTracer* GetDeviceTracer(); // Set a name for the cuda kernel operation being launched by the thread. -void SetCurAnnotation(const std::string& anno); +void SetCurAnnotation(Event* event); // Clear the name after the operation is done. void ClearCurAnnotation(); // Current name of the operation being run in the thread. -std::string CurAnnotation(); +std::string CurAnnotationName(); +Event* CurAnnotation(); void SetCurBlock(int block_id); void ClearCurBlock(); int BlockDepth(); + +// Set current thread id, so we can map the system thread id to thread id. +void RecoreCurThreadId(int32_t id); +int32_t GetThreadIdFromSystemThreadId(uint32_t id); } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index 2f4f8101e4b957634d68fb0d64649ff8afba7c54..3008c166938d7db190e8f716ca925fda5ccebc25 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -99,6 +99,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name); __macro(cudnnDestroy); \ __macro(cudnnSetStream); \ __macro(cudnnActivationForward); \ + __macro(cudnnActivationBackward); \ __macro(cudnnConvolutionForward); \ __macro(cudnnConvolutionBackwardBias); \ __macro(cudnnGetConvolutionForwardWorkspaceSize); \ diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h index a260cda49138580b209e647af459e9392d9f18f1..a5b846f500f3677188b170dda76c65047d628064 100644 --- a/paddle/fluid/platform/dynload/mklml.h +++ b/paddle/fluid/platform/dynload/mklml.h @@ -86,6 +86,8 @@ extern void* mklml_dso_handle; __macro(vdPowx); \ __macro(vsInv); \ __macro(vdInv); \ + __macro(vmsErf); \ + __macro(vmdErf); \ __macro(MKL_Set_Num_Threads) MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP); diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 142d38f0609d963ce3ff45c595b8432b0e5edd21..bdb1d1bd3bf47ea89984587ae84d2aa84be232a4 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -31,7 +31,10 @@ limitations under the License. */ #include #include #include +#include +#include +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include "glog/logging.h" #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/port.h" @@ -233,9 +236,11 @@ inline void throw_on_error(ncclResult_t stat, const std::string& msg) { #endif // __APPLE__ and windows #endif // PADDLE_WITH_CUDA -#define PADDLE_THROW(...) \ - throw ::paddle::platform::EnforceNotMet( \ - ::paddle::string::Sprintf(__VA_ARGS__), __FILE__, __LINE__) +#define PADDLE_THROW(...) \ + do { \ + throw ::paddle::platform::EnforceNotMet( \ + ::paddle::string::Sprintf(__VA_ARGS__), __FILE__, __LINE__); \ + } while (0) #define PADDLE_ENFORCE(COND, ...) \ do { \ @@ -270,23 +275,71 @@ inline void throw_on_error(ncclResult_t stat, const std::string& msg) { * extra messages is also supported, for example: * PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2) */ -#define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) \ - do { \ - if (UNLIKELY(nullptr == (__VAL))) { \ - PADDLE_THROW(#__VAL " should not be null\n%s", \ - paddle::string::Sprintf("" __VA_ARGS__)); \ - } \ +#define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) \ + do { \ + if (UNLIKELY(nullptr == (__VAL))) { \ + PADDLE_THROW(#__VAL " should not be null\n%s", \ + ::paddle::string::Sprintf(__VA_ARGS__)); \ + } \ } while (0) -#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) \ +namespace details { +template +inline constexpr bool IsArithmetic() { + return std::is_arithmetic::value; +} + +template +struct TypeConverterImpl { + using Type1 = typename std::common_type::type; + using Type2 = Type1; +}; + +template +struct TypeConverterImpl { + using Type1 = T1; + using Type2 = T2; +}; + +template +struct TypeConverter { + private: + static constexpr bool kIsArithmetic = + IsArithmetic() && IsArithmetic(); + + public: + using Type1 = typename TypeConverterImpl::Type1; + using Type2 = typename TypeConverterImpl::Type2; +}; + +template +using CommonType1 = typename std::add_lvalue_reference< + typename std::add_const::Type1>::type>::type; + +template +using CommonType2 = typename std::add_lvalue_reference< + typename std::add_const::Type2>::type>::type; +} // namespace details + +#define __PADDLE_BINARY_COMPARE(__VAL1, __VAL2, __CMP, __INV_CMP, ...) \ do { \ - if (UNLIKELY(!((__VAL0)__CMP(__VAL1)))) { \ + auto __val1 = (__VAL1); \ + auto __val2 = (__VAL2); \ + using __TYPE1__ = decltype(__val1); \ + using __TYPE2__ = decltype(__val2); \ + using __COMMON_TYPE1__ = \ + ::paddle::platform::details::CommonType1<__TYPE1__, __TYPE2__>; \ + using __COMMON_TYPE2__ = \ + ::paddle::platform::details::CommonType2<__TYPE1__, __TYPE2__>; \ + bool __is_not_error = (static_cast<__COMMON_TYPE1__>(__val1))__CMP( \ + static_cast<__COMMON_TYPE2__>(__val2)); \ + if (UNLIKELY(!__is_not_error)) { \ PADDLE_THROW("Enforce failed. Expected %s " #__CMP \ " %s, but received %s:%s " #__INV_CMP " %s:%s.\n%s", \ - #__VAL0, #__VAL1, #__VAL0, \ - paddle::string::to_string(__VAL0), #__VAL1, \ - paddle::string::to_string(__VAL1), \ - paddle::string::Sprintf("" __VA_ARGS__)); \ + #__VAL1, #__VAL2, #__VAL1, \ + ::paddle::string::to_string(__val1), #__VAL2, \ + ::paddle::string::to_string(__val2), \ + ::paddle::string::Sprintf(__VA_ARGS__)); \ } \ } while (0) diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index 1091badae54a809c4a9da6d0398bcbb538420af0..adcc95367f11dfa2722226e5a0386bedfa6e746e 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -118,59 +118,58 @@ TEST(ENFORCE_GT, OK) { PADDLE_ENFORCE_GT(2, 1); } TEST(ENFORCE_GT, FAIL) { bool caught_exception = false; try { - PADDLE_ENFORCE_GT(1, 2UL); + PADDLE_ENFORCE_GT(1, 2); } catch (paddle::platform::EnforceNotMet error) { caught_exception = true; - EXPECT_TRUE(HasPrefix( - StringPiece(error.what()), - "Enforce failed. Expected 1 > 2UL, but received 1:1 <= 2UL:2.")); + EXPECT_TRUE( + HasPrefix(StringPiece(error.what()), + "Enforce failed. Expected 1 > 2, but received 1:1 <= 2:2.")); } EXPECT_TRUE(caught_exception); } TEST(ENFORCE_GE, OK) { - PADDLE_ENFORCE_GE(2, 2UL); - PADDLE_ENFORCE_GE(3, 2UL); + PADDLE_ENFORCE_GE(2, 2); PADDLE_ENFORCE_GE(3, 2); - PADDLE_ENFORCE_GE(3.21, 2UL); + PADDLE_ENFORCE_GE(3.21, 2.0); } TEST(ENFORCE_GE, FAIL) { bool caught_exception = false; try { - PADDLE_ENFORCE_GE(1, 2UL); + PADDLE_ENFORCE_GE(1, 2); } catch (paddle::platform::EnforceNotMet error) { caught_exception = true; - EXPECT_TRUE(HasPrefix( - StringPiece(error.what()), - "Enforce failed. Expected 1 >= 2UL, but received 1:1 < 2UL:2.")); + EXPECT_TRUE( + HasPrefix(StringPiece(error.what()), + "Enforce failed. Expected 1 >= 2, but received 1:1 < 2:2.")); } EXPECT_TRUE(caught_exception); } TEST(ENFORCE_LE, OK) { PADDLE_ENFORCE_LE(1, 1); - PADDLE_ENFORCE_LE(1, 1UL); - PADDLE_ENFORCE_LE(2, 3UL); - PADDLE_ENFORCE_LE(2UL, 3); - PADDLE_ENFORCE_LE(2UL, 3.2); + PADDLE_ENFORCE_LE(1UL, 1UL); + PADDLE_ENFORCE_LE(2, 3); + PADDLE_ENFORCE_LE(2UL, 3UL); + PADDLE_ENFORCE_LE(2.0, 3.2); } TEST(ENFORCE_LE, FAIL) { bool caught_exception = false; try { - PADDLE_ENFORCE_GT(1, 2UL); + PADDLE_ENFORCE_GT(1, 2); } catch (paddle::platform::EnforceNotMet error) { caught_exception = true; - EXPECT_TRUE(HasPrefix( - StringPiece(error.what()), - "Enforce failed. Expected 1 > 2UL, but received 1:1 <= 2UL:2.")); + EXPECT_TRUE( + HasPrefix(StringPiece(error.what()), + "Enforce failed. Expected 1 > 2, but received 1:1 <= 2:2.")); } EXPECT_TRUE(caught_exception); } TEST(ENFORCE_LT, OK) { PADDLE_ENFORCE_LT(3, 10); - PADDLE_ENFORCE_LT(2, 3UL); - PADDLE_ENFORCE_LT(2UL, 3); + PADDLE_ENFORCE_LT(2UL, 3UL); + PADDLE_ENFORCE_LT(2, 3); } TEST(ENFORCE_LT, FAIL) { bool caught_exception = false; @@ -235,7 +234,13 @@ TEST(ENFORCE_USER_DEFINED_CLASS, EQ) { TEST(ENFORCE_USER_DEFINED_CLASS, NE) { Dims a{{1, 2, 3, 4}}, b{{5, 6, 7, 8}}; - ASSERT_THROW(PADDLE_ENFORCE_EQ(a, b), paddle::platform::EnforceNotMet); + bool caught_exception = false; + try { + PADDLE_ENFORCE_EQ(a, b); + } catch (paddle::platform::EnforceNotMet&) { + caught_exception = true; + } + EXPECT_TRUE(caught_exception); } TEST(EOF_EXCEPTION, THROW_EOF) { diff --git a/paddle/fluid/platform/event.h b/paddle/fluid/platform/event.h new file mode 100644 index 0000000000000000000000000000000000000000..e9bdb82a50fa4166cecdaea1de01d2f458f3da9a --- /dev/null +++ b/paddle/fluid/platform/event.h @@ -0,0 +1,101 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#ifdef PADDLE_WITH_CUDA +#include +#endif +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace platform { + +enum EventType { kMark, kPushRange, kPopRange }; + +class Event { + public: + // The DeviceContext is used to get the cuda stream. + // If CPU profiling mode, can pass nullptr. + Event(EventType type, std::string name, uint32_t thread_id); + + const EventType& type() const; + std::string name() const { return name_; } + uint32_t thread_id() const { return thread_id_; } + +#ifdef PADDLE_WITH_CUDA +#ifndef PADDLE_WITH_CUPTI + cudaEvent_t event() const { return event_; } + int device() const { return device_; } +#endif +#endif + + double CpuElapsedMs(const Event& e) const; + double CudaElapsedMs(const Event& e) const; + + private: + EventType type_; + std::string name_; + uint32_t thread_id_; + int64_t cpu_ns_; +#ifdef PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_CUPTI + int64_t gpu_ns_ = 0; + + public: + void AddCudaElapsedTime(int64_t start_ns, int64_t end_ns) { + gpu_ns_ += end_ns - start_ns; + } + + private: +#else + cudaEvent_t event_ = nullptr; + int device_ = -1; +#endif +#endif +}; + +class MemEvent { + public: + MemEvent(EventType type, uint64_t start_ns, uint64_t end_ns, size_t bytes, + Place place, int64_t thread_id, const std::string& annotation) + : type_(type), + start_ns_(start_ns), + end_ns_(end_ns), + bytes_(bytes), + place_(place), + thread_id_(thread_id), + annotation_(annotation) {} + + const EventType& type() const { return type_; } + uint64_t start_ns() const { return start_ns_; } + uint64_t end_ns() const { return end_ns_; } + size_t bytes() const { return bytes_; } + Place place() const { return place_; } + int64_t thread_id() const { return thread_id_; } + const std::string& annotation() const { return annotation_; } + + private: + EventType type_; + uint64_t start_ns_ = 0; + uint64_t end_ns_ = 0; + size_t bytes_; + Place place_; + int64_t thread_id_; + std::string annotation_; +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index ac86b38a61c9d8e3e946d9fb3f46d8feba7c034d..4dcf7e79043af008cb2067d90d12d629c5c2d0d9 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/fluid/string/split.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cuda_device_guard.h" +#include "paddle/fluid/platform/dynload/cupti.h" #endif #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/init.h" @@ -30,6 +31,9 @@ limitations under the License. */ DEFINE_int32(paddle_num_threads, 1, "Number of threads for each paddle instance."); +DEFINE_int32(multiple_of_cupti_buffer_size, 1, + "Multiple of the CUPTI device buffer size. If the timestamps have " + "been dropped when you are profiling, try increasing this value."); namespace paddle { namespace framework { @@ -78,7 +82,32 @@ void InitP2P(std::vector devices) { #endif } +void InitCupti() { +#ifdef PADDLE_WITH_CUPTI + if (FLAGS_multiple_of_cupti_buffer_size == 1) return; + size_t attrValue = 0, attrValueSize = sizeof(size_t); +#define MULTIPLY_ATTR_VALUE(attr) \ + { \ + PADDLE_ENFORCE(!platform::dynload::cuptiActivityGetAttribute( \ + attr, &attrValueSize, &attrValue)); \ + attrValue *= FLAGS_multiple_of_cupti_buffer_size; \ + LOG(WARNING) << "Set " #attr " " << attrValue << " byte"; \ + PADDLE_ENFORCE(!platform::dynload::cuptiActivitySetAttribute( \ + attr, &attrValueSize, &attrValue)); \ + } + MULTIPLY_ATTR_VALUE(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE); + MULTIPLY_ATTR_VALUE(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE_CDP); +#if CUDA_VERSION >= 9000 + MULTIPLY_ATTR_VALUE(CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_SIZE); +#endif +#undef MULTIPLY_ATTR_VALUE +#endif +} + void InitDevices(bool init_p2p) { + // CUPTI attribute should be set before any CUDA context is created (see CUPTI + // documentation about CUpti_ActivityAttribute). + InitCupti(); /*Init all available devices by default */ std::vector devices; #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 269280d604a13a62046fb7811d34b7c69b61b50f..4fa6774f028bef901f6e11f2d3dafe52a10a548e 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -39,6 +39,45 @@ class MKLDNNHandler { return this->AcquireMemory(md, ptr, "@user_src_mem_p"); } + // TODO(jczaja): extract common part and make AcquireMemory + std::shared_ptr AcquireSrcMemory( + const mkldnn::memory::primitive_desc& mpd, void* ptr) { + auto local_key = key_ + "@user_src_mem_p"; + auto mem_p = + std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false), + " find mem primitive in device context"); + if (mem_p == nullptr) { + mem_p = std::make_shared(mpd, ptr); + dev_ctx_.SetBlob(local_key, mem_p); + } else { + mem_p->set_data_handle(ptr); + // Mark that reusing happenned. All primitives from operator instance + // should be reused or none of them. So we check consistency + is_reusing_ = true; + } + return mem_p; + } + + std::shared_ptr AcquireWeightsMemory( + const mkldnn::memory::primitive_desc& mpd, void* ptr) { + auto local_key = key_ + "@user_weights_mem_p"; + auto mem_p = + std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false), + " find mem primitive in device context"); + if (mem_p == nullptr) { + mem_p = std::make_shared(mpd, ptr); + dev_ctx_.SetBlob(local_key, mem_p); + } else { + mem_p->set_data_handle(ptr); + // Mark that reusing happenned. All primitives from operator instance + // should be reused or none of them. So we check consistency + is_reusing_ = true; + } + return mem_p; + } + std::shared_ptr AcquireWeightsMemory( const mkldnn::memory::desc& md, void* ptr, user_function custom_func = {}) { @@ -232,7 +271,6 @@ class MKLDNNHandler { AppendKey(key, suffix); } - protected: static void AppendKeyDims(std::string* key, const mkldnn::memory::dims& dims) { for (unsigned int i = 0; i < dims.size(); i++) { @@ -250,6 +288,7 @@ class MKLDNNHandler { key->append(s); } + protected: static std::string dims2str(const mkldnn::memory::dims& operand_dims) { std::string dstr = ""; for (size_t i = 0; i < operand_dims.size(); ++i) { @@ -263,6 +302,9 @@ class MKLDNNHandler { mkldnn::engine engine_; std::string key_; bool is_reusing_; + + public: + static constexpr int MaxKeyLength = 256; }; class TransposeMKLDNNHandler : public MKLDNNHandler { @@ -273,37 +315,7 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { mkldnn::engine engine, const std::string& base_key) : platform::MKLDNNHandler(dev_ctx, engine, base_key), dims_(dims), - axis_(axis), - logical_axis_(dims.size(), 0) {} - - std::shared_ptr AcquireSrcMemory( - const mkldnn::memory::format& fmt, void* ptr) { - auto local_key = key_ + "@user_src_mem_p"; - auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false), - " find mem primitive in device context"); - if (mem_p == nullptr) { - // Make memory descriptor using input format, unless it - // cannot be trusted (nchw) then make up memory fmt manually - for (size_t i = 0; i < logical_axis_.size(); ++i) { - logical_axis_[i] = i; - } - auto src_md = fmt != mkldnn::memory::format::nchw - ? platform::MKLDNNMemDesc( - dims_, platform::MKLDNNGetDataType(), fmt) - : Axis2MemoryDesc(dims_, logical_axis_); - mem_p = std::make_shared( - mkldnn::memory::primitive_desc{src_md, engine_}, ptr); - dev_ctx_.SetBlob(local_key, mem_p); - } else { - mem_p->set_data_handle(ptr); - // Mark that reusing happenned. All primitives from operator instance - // should be reused or none of them. So we check consistency - is_reusing_ = true; - } - return mem_p; - } + axis_(axis) {} std::shared_ptr AcquireDstMemory(framework::Tensor* output, platform::Place place) { @@ -388,7 +400,6 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { private: std::vector dims_; std::vector axis_; - std::vector logical_axis_; }; template @@ -548,9 +559,8 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false), "Fail to find convolution primitive in device context"); if (conv_p == nullptr) { - conv_p = std::make_shared(*conv_pd_, *(src_memory_p), - *(weights_memory_p.get()), - *(dst_memory_p.get())); + conv_p = std::make_shared(*conv_pd_, *src_memory_p, + *weights_memory_p, *dst_memory_p); dev_ctx_.SetBlob(prim_key, conv_p); } else { @@ -570,9 +580,9 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false), "Fail to find convolution primitive in device context"); if (conv_p == nullptr) { - conv_p = std::make_shared( - *conv_pd_, *(src_memory_p), *(weights_memory_p.get()), - *(bias_memory_p.get()), *(dst_memory_p.get())); + conv_p = std::make_shared(*conv_pd_, *src_memory_p, + *weights_memory_p, *bias_memory_p, + *dst_memory_p); dev_ctx_.SetBlob(prim_key, conv_p); } else { diff --git a/paddle/fluid/platform/mkldnn_utils.h b/paddle/fluid/platform/mkldnn_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..8c511f97d12cfe299ad5629eff1871e8d156c850 --- /dev/null +++ b/paddle/fluid/platform/mkldnn_utils.h @@ -0,0 +1,69 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include + +namespace paddle { +namespace platform { + +inline mkldnn::memory::primitive_desc create_prim_desc_from_dims( + const std::vector& ltz, mkldnn::memory::format fmt, + mkldnn::memory::data_type data_type = mkldnn::memory::data_type::f32) { + mkldnn_memory_desc_t mem_fmt; + + mem_fmt.primitive_kind = mkldnn_memory; + mem_fmt.ndims = ltz.size(); + for (unsigned int i = 0; i < ltz.size(); ++i) { + mem_fmt.dims[i] = ltz[i]; // logical dimensions (nchw format, + // regardless physical layout) + } + mem_fmt.data_type = static_cast(data_type); + mem_fmt.format = static_cast(fmt); + + unsigned int total_stride = 1; + for (int i = ltz.size() - 1; i >= 0; --i) { + mem_fmt.layout_desc.blocking.padding_dims[i] = + ltz[i]; // logical dimensions (nchw format, regardless physical + // layout) + mem_fmt.layout_desc.blocking.block_dims[i] = 1; + mem_fmt.layout_desc.blocking.offset_padding_to_data[i] = 0; // no offset + mem_fmt.layout_desc.blocking.strides[0][i] = total_stride; + mem_fmt.layout_desc.blocking.strides[1][i] = 1; + total_stride *= ltz[i]; + } + mem_fmt.layout_desc.blocking.offset_padding = 0; // no initial offset + + auto& pool = platform::DeviceContextPool::Instance(); + auto place = paddle::platform::CPUPlace(); + auto* dev_ctx = dynamic_cast(pool.Get(place)); + auto& cpu_engine = dev_ctx->GetEngine(); + return mkldnn::memory::primitive_desc(mem_fmt, cpu_engine); +} + +inline mkldnn::memory::primitive_desc create_prim_desc_from_format( + const std::vector& ltz, const mkldnn::memory::format format, + const mkldnn::memory::data_type data_type) { + auto md = mkldnn::memory::desc({ltz}, data_type, format); + auto& pool = platform::DeviceContextPool::Instance(); + auto place = paddle::platform::CPUPlace(); + auto dev_ctx = dynamic_cast(pool.Get(place)); + PADDLE_ENFORCE_NOT_NULL(dev_ctx, "Could not get valid device"); + auto& cpu_engine = dev_ctx->GetEngine(); + return mkldnn::memory::primitive_desc(md, cpu_engine); +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/ngraph_helper.h b/paddle/fluid/platform/ngraph_helper.h index 5ee985ea719f8cb28bf8be23823eb6c96f4af1a3..e74f57a79a66ea8fe8c9b972a9a2ec9d722731eb 100644 --- a/paddle/fluid/platform/ngraph_helper.h +++ b/paddle/fluid/platform/ngraph_helper.h @@ -43,6 +43,13 @@ std::shared_ptr Nchw2Nhwc(std::shared_ptr in) { return std::make_shared(in, axis_vec, in_shape); } +ngraph::Shape FlattenTo1d(ngraph::Shape sh, int num) { + auto x1 = std::accumulate(std::begin(sh), std::end(sh) + num, 1, + std::multiplies()); + size_t x1_l = (size_t)x1; + return ngraph::Shape{x1_l}; +} + ngraph::Shape FlattenTo2d(ngraph::Shape sh, int num) { auto x1 = std::accumulate(std::begin(sh), std::begin(sh) + num, 1, std::multiplies()); diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 85977366e61c676fc5d2d3c5d22dd2f606543684..6d055a442106d88af39c771f3ddf156ba616c99f 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/platform/profiler.h" #include #include #include @@ -19,6 +20,8 @@ limitations under the License. */ #include // NOLINT #include #include +#include + #ifdef PADDLE_WITH_CUDA #include #endif // PADDLE_WITH_CUDA @@ -27,7 +30,6 @@ limitations under the License. */ #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/platform/device_tracer.h" #include "paddle/fluid/platform/port.h" -#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/printf.h" DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not."); @@ -35,8 +37,6 @@ DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not."); namespace paddle { namespace platform { -struct EventList; - static int64_t profiler_lister_id = 0; static bool should_send_profile_state = false; std::mutex profiler_mu; @@ -52,42 +52,15 @@ static uint32_t g_next_thread_id = 0; // The global mutex static std::mutex g_all_event_lists_mutex; // The total event lists of all threads -static std::list> g_all_event_lists; +static std::list>> g_all_event_lists; // The thread local event list only can be accessed by the specific thread -static thread_local std::shared_ptr g_event_list; - -struct EventList { - constexpr static size_t kMB = 1024 * 1024; - constexpr static size_t kEventBlockSize = 16 * kMB; - constexpr static size_t kEventSize = sizeof(Event); - constexpr static size_t kEventAlign = alignof(Event); - constexpr static size_t kNumBlock = - kEventBlockSize / - ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign); - - template - void Record(Args&&... args) { - if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) { - event_blocks.emplace_front(); - event_blocks.front().reserve(kNumBlock); - } - event_blocks.front().emplace_back(std::forward(args)...); - } - - std::vector Reduce() { - std::vector result; - for (auto& block : event_blocks) { - result.insert(result.begin(), std::make_move_iterator(block.begin()), - std::make_move_iterator(block.end())); - } - event_blocks.clear(); - return result; - } +static thread_local std::shared_ptr> g_event_list; - void Clear() { event_blocks.clear(); } - - std::forward_list> event_blocks; -}; +static std::list>> g_all_mem_event_lists; +static thread_local std::shared_ptr> g_mem_event_list; +static std::mutex g_all_mem_event_lists_mutex; +static thread_local int32_t g_mem_thread_id; +static uint32_t g_mem_next_thread_id = 0; inline uint64_t GetTimeInNsec() { using clock = std::conditionalGetPlace()) : false; - if (has_cuda_) { - auto* cuda_dev_ctx = static_cast(dev_ctx); - PADDLE_ENFORCE(cudaSetDevice( - boost::get(cuda_dev_ctx->GetPlace()).device)); - PADDLE_ENFORCE(cudaGetDevice(&device_)); - PADDLE_ENFORCE(cudaEventCreate(&event_)); - auto stream = cuda_dev_ctx->stream(); - PADDLE_ENFORCE(cudaEventRecord(event_, stream)); - } -#endif +Event::Event(EventType type, std::string name, uint32_t thread_id) + : type_(type), name_(name), thread_id_(thread_id) { cpu_ns_ = GetTimeInNsec(); } -const EventType& Event::type() const { return type_; } +const EventType &Event::type() const { return type_; } -double Event::CpuElapsedMs(const Event& e) const { +double Event::CpuElapsedMs(const Event &e) const { return (e.cpu_ns_ - cpu_ns_) / (1000000.0); } -double Event::CudaElapsedMs(const Event& e) const { -#ifdef PADDLE_WITH_CUDA - if (!has_cuda_) return 0.0; - PADDLE_ENFORCE(e.has_cuda() && has_cuda()); - PADDLE_ENFORCE(e.device() == device()); - PADDLE_ENFORCE(cudaEventSynchronize(event_)); - PADDLE_ENFORCE(cudaEventSynchronize(e.event())); - float ms; - PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event())); - return ms; +double Event::CudaElapsedMs(const Event &e) const { +#ifdef PADDLE_WITH_CUPTI + return gpu_ns_ / 1000000.0; #else - PADDLE_THROW("CUDA is not enabled"); + LOG_FIRST_N(WARNING, 1) << "CUDA CUPTI is not enabled"; + return 0; #endif } -#ifdef PADDLE_WITH_CUDA -static void ForEachDevice(std::function func) { - auto original_device = GetCurrentDeviceId(); - int count = GetCUDADeviceCount(); - for (int i = 0; i < count; i++) { - SetDeviceId(i); - func(i); +inline EventList &GetMemEventList() { + if (!g_mem_event_list) { + g_mem_event_list = std::make_shared>(); + std::lock_guard guard(g_all_mem_event_lists_mutex); + g_mem_thread_id = g_mem_next_thread_id++; + g_all_mem_event_lists.emplace_front(g_mem_event_list); } - SetDeviceId(original_device); + return *g_mem_event_list; } -#endif -inline EventList& GetEventList() { +void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, + const Place &place, const std::string &annotation) { + GetMemEventList().Record(EventType::kPushRange, start_ns, end_ns, bytes, + place, g_mem_thread_id, annotation); +} + +void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, + const Place &place, const std::string &annotation) { + GetMemEventList().Record(EventType::kPopRange, start_ns, end_ns, bytes, place, + g_mem_thread_id, annotation); +} + +inline EventList &GetEventList() { if (!g_event_list) { std::lock_guard guard(g_all_event_lists_mutex); - g_event_list = std::make_shared(); + g_event_list = std::make_shared>(); g_thread_id = g_next_thread_id++; g_all_event_lists.emplace_front(g_event_list); + RecoreCurThreadId(g_thread_id); } return *g_event_list; } -void Mark(const std::string& name, const DeviceContext* dev_ctx) { - GetEventList().Record(EventType::kMark, name, g_thread_id, dev_ctx); +void Mark(const std::string &name) { + GetEventList().Record(EventType::kMark, name, g_thread_id); } -void PushEvent(const std::string& name, const DeviceContext* dev_ctx) { - GetEventList().Record(EventType::kPushRange, name, g_thread_id, dev_ctx); +Event *PushEvent(const std::string &name) { + return GetEventList().Record(EventType::kPushRange, name, g_thread_id); } -void PopEvent(const std::string& name, const DeviceContext* dev_ctx) { - GetEventList().Record(EventType::kPopRange, name, g_thread_id, dev_ctx); +void PopEvent(const std::string &name) { + GetEventList().Record(EventType::kPopRange, name, g_thread_id); } -RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx) +RecordEvent::RecordEvent(const std::string &name) : is_enabled_(false), start_ns_(PosixInNsec()) { if (g_state == ProfilerState::kDisabled) return; - std::lock_guard l(profiler_mu); + // lock is not needed, the code below is thread-safe is_enabled_ = true; - dev_ctx_ = dev_ctx; name_ = name; - PushEvent(name_, dev_ctx_); + Event *e = PushEvent(name_); // Maybe need the same push/pop behavior. - SetCurAnnotation(name_); + SetCurAnnotation(e); } RecordEvent::~RecordEvent() { if (g_state == ProfilerState::kDisabled || !is_enabled_) return; - std::lock_guard l(profiler_mu); - DeviceTracer* tracer = GetDeviceTracer(); + // lock is not needed, the code below is thread-safe + DeviceTracer *tracer = GetDeviceTracer(); if (tracer) { - tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec(), + tracer->AddCPURecords(CurAnnotationName(), start_ns_, PosixInNsec(), BlockDepth(), g_thread_id); } ClearCurAnnotation(); - PopEvent(name_, dev_ctx_); + PopEvent(name_); +} + +MemEvenRecorder MemEvenRecorder::recorder; + +void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place, + size_t size) { + if (g_state == ProfilerState::kDisabled) return; + std::lock_guard guard(mtx_); + auto &events = address_memevent_[place]; + PADDLE_ENFORCE(events.count(ptr) == 0, ""); + events.emplace(ptr, std::unique_ptr( + new MemEvenRecorder::RecordMemEvent(place, size))); +} + +void MemEvenRecorder::PopMemRecord(const void *ptr, const Place &place) { + if (g_state == ProfilerState::kDisabled) return; + std::lock_guard guard(mtx_); + auto &events = address_memevent_[place]; + auto iter = events.find(ptr); + // The ptr maybe not in address_memevent + if (iter != events.end()) { + events.erase(iter); + } } -RecordRPCEvent::RecordRPCEvent(const std::string& name, - const DeviceContext* dev_ctx) { +void MemEvenRecorder::Flush() { + std::lock_guard guard(mtx_); + address_memevent_.clear(); +} + +MemEvenRecorder::RecordMemEvent::RecordMemEvent(const Place &place, + size_t bytes) + : place_(place), + bytes_(bytes), + start_ns_(PosixInNsec()), + alloc_in_(CurAnnotationName()) { + PushMemEvent(start_ns_, end_ns_, bytes_, place_, alloc_in_); +} + +MemEvenRecorder::RecordMemEvent::~RecordMemEvent() { + DeviceTracer *tracer = GetDeviceTracer(); + end_ns_ = PosixInNsec(); + + auto annotation_free = CurAnnotationName(); + if (tracer) { + tracer->AddMemInfoRecord(start_ns_, end_ns_, bytes_, place_, alloc_in_, + annotation_free, g_mem_thread_id); + } + PopMemEvent(start_ns_, end_ns_, bytes_, place_, annotation_free); +} + +RecordRPCEvent::RecordRPCEvent(const std::string &name) { if (FLAGS_enable_rpc_profiler) { - event_.reset(new platform::RecordEvent(name, dev_ctx)); + event_.reset(new platform::RecordEvent(name)); } } RecordBlock::RecordBlock(int block_id) : is_enabled_(false), start_ns_(PosixInNsec()) { - std::lock_guard l(profiler_mu); + // lock is not needed, the code below is thread-safe if (g_state == ProfilerState::kDisabled) return; is_enabled_ = true; SetCurBlock(block_id); @@ -213,9 +225,9 @@ RecordBlock::RecordBlock(int block_id) } RecordBlock::~RecordBlock() { - std::lock_guard l(profiler_mu); + // lock is not needed, the code below is thread-safe if (g_state == ProfilerState::kDisabled || !is_enabled_) return; - DeviceTracer* tracer = GetDeviceTracer(); + DeviceTracer *tracer = GetDeviceTracer(); if (tracer) { // We try to put all blocks at the same nested depth in the // same timeline lane. and distinguish the using thread_id. @@ -225,11 +237,21 @@ RecordBlock::~RecordBlock() { ClearCurBlock(); } +void SynchronizeAllDevice() { +#ifdef PADDLE_WITH_CUDA + int count = GetCUDADeviceCount(); + for (int i = 0; i < count; i++) { + SetDeviceId(i); + PADDLE_ENFORCE(cudaDeviceSynchronize()); + } +#endif +} + void EnableProfiler(ProfilerState state) { PADDLE_ENFORCE(state != ProfilerState::kDisabled, "Can't enable profiling, since the input state is ", "ProfilerState::kDisabled"); - + SynchronizeAllDevice(); std::lock_guard l(profiler_mu); if (state == g_state) { return; @@ -238,28 +260,30 @@ void EnableProfiler(ProfilerState state) { should_send_profile_state = true; GetDeviceTracer()->Enable(); #ifdef PADDLE_WITH_CUDA - if (g_state == ProfilerState::kCUDA) { + if (g_state == ProfilerState::kCUDA || g_state == ProfilerState::kAll || + g_state == ProfilerState::kCPU) { // Generate some dummy events first to reduce the startup overhead. - for (int i = 0; i < 5; i++) { - ForEachDevice([](int d) { - DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d)); - Mark("_cuda_startup_", dev_ctx); - dev_ctx->Wait(); - delete dev_ctx; - }); - } + DummyKernelAndEvent(); + GetDeviceTracer()->Reset(); } #endif // Mark the profiling start. - Mark("_start_profiler_", nullptr); + Mark("_start_profiler_"); } void ResetProfiler() { + SynchronizeAllDevice(); + GetDeviceTracer()->Reset(); + MemEvenRecorder::Instance().Flush(); std::lock_guard guard(g_all_event_lists_mutex); for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end(); ++it) { (*it)->Clear(); } + for (auto it = g_all_mem_event_lists.begin(); + it != g_all_mem_event_lists.end(); ++it) { + (*it)->Clear(); + } } std::vector> GetAllEvents() { @@ -272,20 +296,31 @@ std::vector> GetAllEvents() { return result; } +std::vector> GetMemEvents() { + std::lock_guard guard(g_all_mem_event_lists_mutex); + std::vector> result; + for (auto &it : g_all_mem_event_lists) { + result.emplace_back((*it).Reduce()); + } + return result; +} + // The information of each event given in the profiling report struct EventItem { std::string name; int calls; double total_time; - double min_time; double max_time; double ave_time; + double min_time; + double cpu_time; + double gpu_time; float ratio; }; // Print results -void PrintProfiler(const std::vector>& events_table, - const std::string& sorted_domain, const size_t name_width, +void PrintProfiler(const std::vector> &events_table, + const std::string &sorted_domain, const size_t name_width, const size_t data_width, bool merge_thread) { // Output header information std::cout << "\n------------------------->" @@ -313,17 +348,31 @@ void PrintProfiler(const std::vector>& events_table, // Output events table std::cout.setf(std::ios::left); std::cout << std::setw(name_width) << "Event" << std::setw(data_width) - << "Calls" << std::setw(data_width) << "Total" - << std::setw(data_width) << "Min." << std::setw(data_width) + << "Calls" << std::setw(data_width) << "Total"; + if (g_state == ProfilerState::kAll) { + std::cout << std::setw(data_width * 2) << "CPU Time (Ratio)" + << std::setw(data_width * 2) << "GPU Time (Ratio)"; + } + std::cout << std::setw(data_width) << "Min." << std::setw(data_width) << "Max." << std::setw(data_width) << "Ave." << std::setw(data_width) << "Ratio." << std::endl; for (size_t i = 0; i < events_table.size(); ++i) { for (size_t j = 0; j < events_table[i].size(); ++j) { - const EventItem& event_item = events_table[i][j]; + const EventItem &event_item = events_table[i][j]; std::cout << std::setw(name_width) << event_item.name << std::setw(data_width) << event_item.calls - << std::setw(data_width) << event_item.total_time - << std::setw(data_width) << event_item.min_time + << std::setw(data_width) << event_item.total_time; + if (g_state == ProfilerState::kAll) { + std::cout << std::setw(data_width * 2) + << string::Sprintf( + "%f (%f)", event_item.cpu_time, + (event_item.cpu_time / event_item.total_time)) + << std::setw(data_width * 2) + << string::Sprintf( + "%f (%f)", event_item.gpu_time, + (event_item.gpu_time / event_item.total_time)); + } + std::cout << std::setw(data_width) << event_item.min_time << std::setw(data_width) << event_item.max_time << std::setw(data_width) << event_item.ave_time << std::setw(data_width) << event_item.ratio << std::endl; @@ -333,50 +382,62 @@ void PrintProfiler(const std::vector>& events_table, } // Parse the event list and output the profiling report -void ParseEvents(const std::vector>& events, +void ParseEvents(const std::vector> &events, bool merge_thread, EventSortingKey sorted_by = EventSortingKey::kDefault) { if (g_state == ProfilerState::kDisabled) return; if (merge_thread && events.size() < 2) return; std::string sorted_domain; - std::function sorted_func; + std::function sorted_func; switch (sorted_by) { case EventSortingKey::kCalls: sorted_domain = "number of calls"; - sorted_func = [](const EventItem& a, const EventItem& b) { + sorted_func = [](const EventItem &a, const EventItem &b) { return a.calls > b.calls; }; break; case EventSortingKey::kTotal: sorted_domain = "total time"; - sorted_func = [](const EventItem& a, const EventItem& b) { + sorted_func = [](const EventItem &a, const EventItem &b) { return a.total_time > b.total_time; }; break; case EventSortingKey::kMin: sorted_domain = "minimum time"; - sorted_func = [](const EventItem& a, const EventItem& b) { + sorted_func = [](const EventItem &a, const EventItem &b) { return a.min_time > b.min_time; }; break; case EventSortingKey::kMax: sorted_domain = "maximum time"; - sorted_func = [](const EventItem& a, const EventItem& b) { + sorted_func = [](const EventItem &a, const EventItem &b) { return a.max_time > b.max_time; }; break; case EventSortingKey::kAve: sorted_domain = "average time"; - sorted_func = [](const EventItem& a, const EventItem& b) { + sorted_func = [](const EventItem &a, const EventItem &b) { return a.ave_time > b.ave_time; }; break; + case EventSortingKey::kGPUTime: + sorted_domain = "average time"; + sorted_func = [](const EventItem &a, const EventItem &b) { + return a.gpu_time > b.gpu_time; + }; + break; + case EventSortingKey::kCPUTime: + sorted_domain = "average time"; + sorted_func = [](const EventItem &a, const EventItem &b) { + return a.cpu_time > b.cpu_time; + }; + break; default: sorted_domain = "event first end time"; } - const std::vector>* analyze_events; + const std::vector> *analyze_events; std::vector> merged_events_list; if (merge_thread) { std::vector merged_events; @@ -410,10 +471,17 @@ void ParseEvents(const std::vector>& events, } if (rit != pushed_events.rend()) { - double event_time = (g_state == ProfilerState::kCUDA || - g_state == ProfilerState::kAll) - ? rit->CudaElapsedMs((*analyze_events)[i][j]) - : rit->CpuElapsedMs((*analyze_events)[i][j]); + double event_time = 0; + double gpu_time = rit->CudaElapsedMs((*analyze_events)[i][j]); + double cpu_time = rit->CpuElapsedMs((*analyze_events)[i][j]); + if (g_state == ProfilerState::kCUDA) { + event_time = gpu_time; + } else if (g_state == ProfilerState::kCPU) { + event_time = cpu_time; + } else { + event_time = gpu_time + cpu_time; + } + total += event_time; std::string event_name; @@ -430,7 +498,7 @@ void ParseEvents(const std::vector>& events, event_idx[event_name] = event_items.size(); EventItem event_item = {event_name, 1, event_time, event_time, event_time, event_time, - 0.}; + gpu_time, cpu_time, 0.}; event_items.push_back(event_item); } else { int index = event_idx[event_name]; @@ -443,6 +511,8 @@ void ParseEvents(const std::vector>& events, // max time event_items[index].max_time = std::max(event_time, event_items[index].max_time); + event_items[index].gpu_time += gpu_time; + event_items[index].cpu_time += cpu_time; } // remove the push marker from the list @@ -455,7 +525,7 @@ void ParseEvents(const std::vector>& events, } } // average time - for (auto& item : event_items) { + for (auto &item : event_items) { item.ave_time = item.total_time / item.calls; item.ratio = item.total_time / total; } @@ -479,22 +549,92 @@ void ParseEvents(const std::vector>& events, merge_thread); } +struct MemoryProfierReport { + size_t alloc_times{0}; + size_t alloc_size{0}; + size_t free_times{0}; + size_t free_size{0}; +}; + +// Print results +void PrintMemProfiler( + const std::map> + &annotation_report, + const size_t name_width, const size_t data_width) { + // Output header information + std::cout << "\n------------------------->" + << " Memory Profiling Report " + << "<-------------------------\n\n"; + + // Output events table + std::cout.setf(std::ios::left); + std::cout << std::setw(name_width) << "Event" << std::setw(data_width) + << "Alloc Calls" << std::setw(data_width) << "Size(MB)" + << std::setw(data_width) << "Free Calls" << std::setw(data_width) + << "Size(MB)" << std::endl; + + for (auto &tmp : annotation_report) { + for (auto &e : tmp.second) { + auto event_name = string::Sprintf("%s:%s", tmp.first, e.first); + std::cout << std::setw(name_width) << event_name; + std::cout << std::setw(data_width) << e.second.alloc_times; + std::cout << std::setw(data_width) + << e.second.alloc_size / (1024.0 * 1024.0); + std::cout << std::setw(data_width) << e.second.free_times; + std::cout << std::setw(data_width) + << e.second.free_size / (1024.0 * 1024.0) << std::endl; + } + } + std::cout << std::endl; +} + +// parse memory events +void ParseMemEvents(const std::vector> &events) { + if (g_state == ProfilerState::kDisabled) return; + // place, annotation, alloc times, alloc size + std::map> + annotation_report; + + for (auto &tmp : events) { + for (auto &e : tmp) { + if (e.type() == EventType::kPushRange) { + annotation_report[e.place()][e.annotation()].alloc_times += 1; + annotation_report[e.place()][e.annotation()].alloc_size += e.bytes(); + } else if (e.type() == EventType::kPopRange) { + annotation_report[e.place()][e.annotation()].free_times += 1; + annotation_report[e.place()][e.annotation()].free_size += e.bytes(); + } + } + } + PrintMemProfiler(annotation_report, 55, 18); +} + void DisableProfiler(EventSortingKey sorted_key, - const std::string& profile_path) { + const std::string &profile_path) { + SynchronizeAllDevice(); + MemEvenRecorder::Instance().Flush(); + std::lock_guard l(profiler_mu); if (g_state == ProfilerState::kDisabled) return; // Mark the profiling stop. - Mark("_stop_profiler_", nullptr); + Mark("_stop_profiler_"); - std::vector> all_events = GetAllEvents(); - ParseEvents(all_events, true, sorted_key); - ParseEvents(all_events, false, sorted_key); - ResetProfiler(); - DeviceTracer* tracer = GetDeviceTracer(); + DeviceTracer *tracer = GetDeviceTracer(); if (tracer->IsEnabled()) { tracer->Disable(); tracer->GenProfile(profile_path); + tracer->GenEventKernelCudaElapsedTime(); } + + std::vector> all_events = GetAllEvents(); + ParseEvents(all_events, true, sorted_key); + ParseEvents(all_events, false, sorted_key); + if (VLOG_IS_ON(5)) { + std::vector> all_mem_events = GetMemEvents(); + ParseMemEvents(all_mem_events); + } + + ResetProfiler(); g_state = ProfilerState::kDisabled; should_send_profile_state = true; } diff --git a/paddle/fluid/platform/profiler.cu b/paddle/fluid/platform/profiler.cu new file mode 100644 index 0000000000000000000000000000000000000000..aed276b16e95f954539d3fadac65309314ed34f1 --- /dev/null +++ b/paddle/fluid/platform/profiler.cu @@ -0,0 +1,50 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace platform { + +__global__ void DummyKernel(int *a) { a[0] = 0; } + +static void ForEachDevice(std::function func) { + auto original_device = platform::GetCurrentDeviceId(); + int count = platform::GetCUDADeviceCount(); + for (int i = 0; i < count; i++) { + platform::SetDeviceId(i); + func(i); + } + platform::SetDeviceId(original_device); +} + +void DummyKernelAndEvent() { + for (int i = 0; i < 5; i++) { + ForEachDevice([](int d) { + platform::SetDeviceId(d); + cudaStream_t stream; + PADDLE_ENFORCE(cudaStreamCreate(&stream)); + Mark("_cuda_startup_"); + int *ptr; + PADDLE_ENFORCE(cudaMalloc(&ptr, sizeof(int))); + DummyKernel<<<1, 1, 0, stream>>>(ptr); + PADDLE_ENFORCE(cudaStreamSynchronize(stream)); + PADDLE_ENFORCE(cudaFree(ptr)); + }); + } +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index f5d3490634f3199a23986ec3ae13d9fe3577ac35..8d11855b70de824159f19f2997b876564e7719b1 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -15,46 +15,22 @@ limitations under the License. */ #pragma once #include #include +#include +#include +#include // NOLINT #include +#include +#include +#include #include -#include "paddle/fluid/platform/device_context.h" - -namespace paddle { -namespace platform { - -enum EventType { kMark, kPushRange, kPopRange }; - -class Event { - public: - // The DeviceContext is used to get the cuda stream. - // If CPU profiling mode, can pass nullptr. - Event(EventType type, std::string name, uint32_t thread_id, - const DeviceContext* dev_ctx); - - const EventType& type() const; - std::string name() const { return name_; } - uint32_t thread_id() const { return thread_id_; } - bool has_cuda() const { return has_cuda_; } - -#ifdef PADDLE_WITH_CUDA - cudaEvent_t event() const { return event_; } - int device() const { return device_; } -#endif - - double CpuElapsedMs(const Event& e) const; - double CudaElapsedMs(const Event& e) const; - - private: - EventType type_; - std::string name_; - uint32_t thread_id_; - int64_t cpu_ns_; - bool has_cuda_; +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/event.h" +#include "paddle/fluid/platform/place.h" #ifdef PADDLE_WITH_CUDA - cudaEvent_t event_ = nullptr; - int device_ = -1; +#include "paddle/fluid/platform/gpu_info.h" #endif -}; +namespace paddle { +namespace platform { enum ProfilerState { kDisabled, // disabled state @@ -63,22 +39,52 @@ enum ProfilerState { kAll, // Profile both CPU and GPU. (Currently experimental). }; -void Mark(const std::string& name, const DeviceContext* dev_ctx); +void Mark(const std::string& name); + +void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, + const Place& place); +void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, + const Place& place); -void PushEvent(const std::string& name, const DeviceContext* dev_ctx); +struct MemEvenRecorder { + public: + void PushMemRecord(const void* ptr, const Place& place, size_t size); + void PopMemRecord(const void* ptr, const Place& place); + void Flush(); + static MemEvenRecorder& Instance() { return recorder; } + + private: + struct RecordMemEvent { + RecordMemEvent(const Place& place, size_t bytes); + ~RecordMemEvent(); + + Place place_; + size_t bytes_; + uint64_t start_ns_; + uint64_t end_ns_; + std::string alloc_in_; + std::string free_in_; + }; + + static MemEvenRecorder recorder; + std::map>> + address_memevent_; + std::mutex mtx_; + MemEvenRecorder() {} + DISABLE_COPY_AND_ASSIGN(MemEvenRecorder); +}; -void PopEvent(const std::string& name, const DeviceContext* dev_ctx); +Event* PushEvent(const std::string& name); +void PopEvent(const std::string& name); struct RecordEvent { - // dev_ctx can be set to nullptr if device is cpu. - RecordEvent(const std::string& name, const DeviceContext* dev_ctx); + explicit RecordEvent(const std::string& name); ~RecordEvent(); bool is_enabled_; uint64_t start_ns_; - // The device context is used by Event to get the current cuda stream. - const DeviceContext* dev_ctx_; // Event name std::string name_; // Need to distinguish name by op type, block_id, program_id and perhaps @@ -88,8 +94,7 @@ struct RecordEvent { class RecordRPCEvent { public: - // dev_ctx can be set to nullptr if device is cpu. - RecordRPCEvent(const std::string& name, const DeviceContext* dev_ctx); + explicit RecordRPCEvent(const std::string& name); ~RecordRPCEvent() {} private: @@ -111,7 +116,51 @@ struct RecordBlock { std::vector> GetAllEvents(); // Candidate keys to sort the profiling report -enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve }; +enum EventSortingKey { + kDefault, + kCalls, + kTotal, + kMin, + kMax, + kAve, + kCPUTime, + kGPUTime +}; + +template +struct EventList { + constexpr static size_t kMB = 1024 * 1024; + constexpr static size_t kEventBlockSize = 16 * kMB; + constexpr static size_t kEventSize = sizeof(T); + constexpr static size_t kEventAlign = alignof(T); + constexpr static size_t kNumBlock = + kEventBlockSize / + ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign); + + template + T* Record(Args&&... args) { + if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) { + event_blocks.emplace_front(); + event_blocks.front().reserve(kNumBlock); + } + event_blocks.front().emplace_back(std::forward(args)...); + return &event_blocks.front().back(); + } + + std::vector Reduce() { + std::vector result; + for (auto& block : event_blocks) { + result.insert(result.begin(), std::make_move_iterator(block.begin()), + std::make_move_iterator(block.end())); + } + event_blocks.clear(); + return result; + } + + void Clear() { event_blocks.clear(); } + + std::forward_list> event_blocks; +}; // Enable the profiling function. void EnableProfiler(ProfilerState state); @@ -132,5 +181,9 @@ bool ShouldSendProfileState(); void SetProfileListener(); int64_t ListenerId(); +#ifdef PADDLE_WITH_CUDA +void DummyKernelAndEvent(); +#endif + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/profiler.proto b/paddle/fluid/platform/profiler.proto index 7b42aa785ec6ad5731e3adee1e9f189127a826a1..cfa3c6906f83f750c8d6dc654f29b8fe95ec17ac 100644 --- a/paddle/fluid/platform/profiler.proto +++ b/paddle/fluid/platform/profiler.proto @@ -31,10 +31,28 @@ message Event { optional int64 sub_device_id = 6; optional MemCopy memcopy = 7; + optional string detail_info = 9; +} + +message MemEvent { + enum Place { + CUDAPlace = 0; + CPUPlace = 1; + CUDAPinnedPlace = 2; + } + optional uint64 start_ns = 1; + optional uint64 end_ns = 2; + optional uint64 bytes = 3; + optional Place place = 4; + optional uint64 thread_id = 5; + optional uint32 device_id = 6; + optional string alloc_in = 7; + optional string free_in = 8; } message Profile { repeated Event events = 1; optional uint64 start_ns = 2; optional uint64 end_ns = 3; + repeated MemEvent mem_events = 4; } \ No newline at end of file diff --git a/paddle/fluid/platform/profiler_test.cc b/paddle/fluid/platform/profiler_test.cc index 61f467814ba4a24c8b73f1bc614cda0ab8c4debd..a851488e72d27dfcbd04546d9b531d26257f611c 100644 --- a/paddle/fluid/platform/profiler_test.cc +++ b/paddle/fluid/platform/profiler_test.cc @@ -23,76 +23,48 @@ TEST(Event, CpuElapsedTime) { using paddle::platform::Event; using paddle::platform::EventType; - Event start_event(EventType::kPushRange, "test", 0, nullptr); - EXPECT_TRUE(start_event.has_cuda() == false); + Event start_event(EventType::kPushRange, "test", 0); int counter = 0; while (counter != 1000) { counter++; } - Event stop_event(EventType::kPopRange, "test", 0, nullptr); + Event stop_event(EventType::kPopRange, "test", 0); EXPECT_GT(start_event.CpuElapsedMs(stop_event), 0); } -#ifdef PADDLE_WITH_CUDA -TEST(Event, CudaElapsedTime) { - using paddle::platform::DeviceContext; - using paddle::platform::CUDADeviceContext; - using paddle::platform::CUDAPlace; - using paddle::platform::Event; - using paddle::platform::EventType; - - DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(0)); - Event start_event(EventType::kPushRange, "test", 0, dev_ctx); - EXPECT_TRUE(start_event.has_cuda() == true); - int counter = 0; - while (counter != 1000) { - counter++; - } - Event stop_event(EventType::kPopRange, "test", 0, dev_ctx); - EXPECT_GT(start_event.CudaElapsedMs(stop_event), 0); -} -#endif - TEST(RecordEvent, RecordEvent) { - using paddle::platform::DeviceContext; using paddle::platform::Event; using paddle::platform::EventType; using paddle::platform::RecordEvent; + using paddle::platform::PushEvent; + using paddle::platform::PopEvent; using paddle::platform::ProfilerState; using paddle::platform::EventSortingKey; ProfilerState state = ProfilerState::kCPU; - DeviceContext* dev_ctx = nullptr; -#ifdef PADDLE_WITH_CUDA - using paddle::platform::CUDADeviceContext; - using paddle::platform::CUDAPlace; - state = ProfilerState::kCUDA; - dev_ctx = - new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace(0)); -#endif EnableProfiler(state); /* Usage 1: - * PushEvent(evt_name, dev_ctx); + * PushEvent(evt_name); * ... * code to be analyzed * ... - * PopEvent(evt_name, dev_ctx); + * PopEvent(evt_name); */ LOG(INFO) << "Usage 1: PushEvent & PopEvent"; for (int loop = 0; loop < 3; ++loop) { for (int i = 1; i < 5; ++i) { std::string name = "op_" + std::to_string(i); - PushEvent(name, dev_ctx); + PushEvent(name); int counter = 1; while (counter != i * 1000) counter++; - PopEvent(name, dev_ctx); + PopEvent(name); } } /* Usage 2: * { - * RecordEvent record_event(name, dev_ctx); + * RecordEvent record_event(name); * ... * code to be analyzed * ... @@ -101,7 +73,7 @@ TEST(RecordEvent, RecordEvent) { LOG(INFO) << "Usage 2: RecordEvent"; for (int i = 1; i < 5; ++i) { std::string name = "evs_op_" + std::to_string(i); - RecordEvent record_event(name, dev_ctx); + RecordEvent record_event(name); int counter = 1; while (counter != i * 1000) counter++; } @@ -123,20 +95,20 @@ TEST(RecordEvent, RecordEvent) { LOG(INFO) << "Usage 3: nested RecordEvent"; for (int i = 1; i < 5; ++i) { std::string name = "ano_evs_op_" + std::to_string(i); - RecordEvent record_event(name, dev_ctx); + RecordEvent record_event(name); int counter = 1; while (counter != i * 100) counter++; { std::string nested_name = "nested_ano_evs_op_" + std::to_string(i); - RecordEvent nested_record_event(nested_name, dev_ctx); + RecordEvent nested_record_event(nested_name); int nested_counter = 1; while (nested_counter != i * 100) nested_counter++; } } // Bad Usage: - PushEvent("event_without_pop", dev_ctx); - PopEvent("event_without_push", dev_ctx); + PushEvent("event_without_pop"); + PopEvent("event_without_push"); std::vector> events = paddle::platform::GetAllEvents(); int cuda_startup_count = 0; diff --git a/paddle/fluid/platform/temporary_allocator_test.cc b/paddle/fluid/platform/temporary_allocator_test.cc index 3879cd540017ea22b0cf4eee794a172e56716b74..6dae84f016e5db8007b4a4b4df2b5ed7f5cb4f19 100644 --- a/paddle/fluid/platform/temporary_allocator_test.cc +++ b/paddle/fluid/platform/temporary_allocator_test.cc @@ -141,7 +141,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = static_cast(pool.Get(cpu_place)); - framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx); + framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx, nullptr); int numel = memory_size / sizeof(float); framework::Tensor tensor = @@ -156,7 +156,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = static_cast(pool.Get(gpu_place)); - framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx); + framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx, nullptr); int numel = memory_size / sizeof(float); framework::Tensor tensor = ctx.AllocateTmpTensor( @@ -179,7 +179,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr2) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = static_cast(pool.Get(cpu_place)); - framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx); + framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx, nullptr); int numel = memory_size / sizeof(float); framework::Tensor out_side_tensor; @@ -200,7 +200,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr2) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = static_cast(pool.Get(gpu_place)); - framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx); + framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx, nullptr); size_t memory_size = 500; int numel = memory_size / sizeof(float); diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 31c3bfa43ffec22059a602e9ff09a33188d72c91..6bbda69297a48ce27ce23282c4e08d49ee3cce6c 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -13,10 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/pybind/imperative.h" + +#include +#include +#include +#include + #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/imperative/type_defs.h" +#include "paddle/fluid/pybind/pybind_boost_headers.h" + namespace paddle { namespace pybind { @@ -31,21 +39,21 @@ void BindTracer(pybind11::module* m) { [](imperative::Tracer& self, imperative::OpBase* op, const imperative::VarBasePtrMap& inputs, const imperative::VarBasePtrMap& outputs, - framework::BlockDesc* block, + framework::AttributeMap attrs_map, const platform::CPUPlace expected_place, const bool stop_gradient = false) { - self.Trace(op, inputs, outputs, block, expected_place, - stop_gradient); + return self.Trace(op, inputs, outputs, attrs_map, expected_place, + stop_gradient); }) .def("trace", [](imperative::Tracer& self, imperative::OpBase* op, const imperative::VarBasePtrMap& inputs, const imperative::VarBasePtrMap& outputs, - framework::BlockDesc* block, + framework::AttributeMap attrs_map, const platform::CUDAPlace expected_place, const bool stop_gradient = false) { - self.Trace(op, inputs, outputs, block, expected_place, - stop_gradient); + return self.Trace(op, inputs, outputs, attrs_map, expected_place, + stop_gradient); }) .def("py_trace", &imperative::Tracer::PyTrace, pybind11::return_value_policy::take_ownership); diff --git a/paddle/fluid/pybind/imperative.h b/paddle/fluid/pybind/imperative.h index f947b743f99d5d4994b1a87f89fd6815357d8125..8496cbfcb18798ee8ce1714431b7877bb2b7d377 100644 --- a/paddle/fluid/pybind/imperative.h +++ b/paddle/fluid/pybind/imperative.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include +#include #include #include "paddle/fluid/imperative/layer.h" #include "pybind11/pybind11.h" @@ -33,9 +34,11 @@ class Layer : public imperative::Layer { } }; -class PyOpBase : public imperative::OpBase { +class PYBIND11_HIDDEN PyOpBase : public imperative::OpBase { public: using imperative::OpBase::OpBase; // Inherit constructors + + PyOpBase(const std::string& name) : OpBase(name) {} }; class PyVarBase : public imperative::VarBase { diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 7db2bb451b49918fd8d92a6036c132d34e965c63..236afc77f708c344665821edd4f7c7841c300465 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -65,7 +65,8 @@ void BindInferenceApi(py::module *m) { void BindPaddleDType(py::module *m) { py::enum_(*m, "PaddleDType") .value("FLOAT32", PaddleDType::FLOAT32) - .value("INT64", PaddleDType::INT64); + .value("INT64", PaddleDType::INT64) + .value("INT32", PaddleDType::INT32); } void BindPaddleBuf(py::module *m) { @@ -103,6 +104,11 @@ void BindPaddleBuf(py::module *m) { int64_t *data = static_cast(self.data()); return {data, data + self.length() / sizeof(*data)}; }) + .def("int32_data", + [](PaddleBuf &self) -> std::vector { + int32_t *data = static_cast(self.data()); + return {data, data + self.length() / sizeof(*data)}; + }) .def("length", &PaddleBuf::length); } @@ -221,7 +227,8 @@ void BindAnalysisConfig(py::module *m) { .def("enable_tensorrt_engine", &AnalysisConfig::EnableTensorRtEngine, py::arg("workspace_size") = 1 << 20, py::arg("max_batch_size") = 1, py::arg("min_subgraph_size") = 3, - py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32) + py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32, + py::arg("use_static") = true) .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled) .def("switch_ir_debug", &AnalysisConfig::SwitchIrDebug, py::arg("x") = true) diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc index 1cd1be8e8d9da8c6a82ceefc3284084bfeda0252..c69ccd507210f976c1cb8ad072928b96693a948d 100644 --- a/paddle/fluid/pybind/ir.cc +++ b/paddle/fluid/pybind/ir.cc @@ -14,9 +14,11 @@ #include "paddle/fluid/pybind/ir.h" #include +#include #include #include #include +#include #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" @@ -53,12 +55,14 @@ void BindGraph(py::module *m) { "The graph is a Directed Acyclic Single Static Assignment Graph, see " "`paddle::ir::Graph` for details.") .def(py::init()) + .def("clone", &Graph::Clone) .def("has", &Graph::Has) .def("get_int", &Graph::Get) .def("get_float", &Graph::Get) .def("get_double", &Graph::Get) .def("get_string", &Graph::Get) - .def("get_marked_nodes", &Graph::Get>) + .def("get_marked_nodes", &Graph::Get>, + return_value_policy::reference) .def("set", [](Graph &self, const std::string &attr_name, int attr) { return self.Set(attr_name, new int(attr)); }) .def("set", @@ -101,7 +105,9 @@ void BindGraph(py::module *m) { [](Graph &self, Node &node) { return self.RemoveNode(&node); }) .def("retrieve_node", &Graph::RetrieveNode, return_value_policy::reference) - .def("resolve_hazard", &Graph::ResolveHazard); + .def("resolve_hazard", &Graph::ResolveHazard) + .def("origin_program_desc", &Graph::OriginProgram, + return_value_policy::reference); } void BindNode(py::module *m) { @@ -115,7 +121,7 @@ void BindNode(py::module *m) { .def("is_var", &Node::IsVar) .def("is_ctrl_var", &Node::IsCtrlVar) .def("clear_inputs", [](Node &self) { self.inputs.clear(); }) - .def("inputs_remove", + .def("remove_input", [](Node &self, int node_id) { auto pos = std::find_if( self.inputs.begin(), self.inputs.end(), @@ -124,7 +130,7 @@ void BindNode(py::module *m) { self.inputs.erase(pos); } }) - .def("inputs_remove", + .def("remove_input", [](Node &self, Node &node) { auto pos = std::find(self.inputs.begin(), self.inputs.end(), &node); @@ -132,10 +138,10 @@ void BindNode(py::module *m) { self.inputs.erase(pos); } }) - .def("inputs_append", + .def("append_input", [](Node &self, Node &node) { self.inputs.push_back(&node); }) .def("clear_outputs", [](Node &self) { self.outputs.clear(); }) - .def("outputs_remove", + .def("remove_output", [](Node &self, int node_id) { auto pos = std::find_if( self.outputs.begin(), self.outputs.end(), @@ -144,7 +150,7 @@ void BindNode(py::module *m) { self.outputs.erase(pos); } }) - .def("outputs_remove", + .def("remove_output", [](Node &self, Node &node) { auto pos = std::find(self.outputs.begin(), self.outputs.end(), &node); @@ -152,7 +158,7 @@ void BindNode(py::module *m) { self.outputs.erase(pos); } }) - .def("outputs_append", + .def("append_output", [](Node &self, Node &node) { self.outputs.push_back(&node); }) .def_readwrite("inputs", &Node::inputs) .def_readwrite("outputs", &Node::outputs); diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index e729be4a95a58510f1e0162af4216feaa400d971..7b5e417504fa16426279c8ed3c24d6d62e6be404 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -23,97 +23,7 @@ limitations under the License. */ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/var_desc.h" -// Cast boost::variant for PyBind. -// Copy from -// https://github.com/pybind/pybind11/issues/576#issuecomment-269563199 -namespace pybind11 { -namespace detail { - -#if !defined(PYBIND11_HIDDEN) -#ifdef _WIN32 -#define PYBIND11_HIDDEN __declspec(dllexport) -#else -#define PYBIND11_HIDDEN __attribute__((visibility("hidden"))) -#endif -#endif - -// Can be replaced by a generic lambda in C++14 -struct PYBIND11_HIDDEN paddle_variant_caster_visitor - : public boost::static_visitor { - return_value_policy policy; - handle parent; - - paddle_variant_caster_visitor(return_value_policy policy, handle parent) - : policy(policy), parent(parent) {} - - template - handle operator()(T const &src) const { - return make_caster::cast(src, policy, parent); - } -}; - -template -struct paddle_variant_caster; - -template