diff --git a/CMakeLists.txt b/CMakeLists.txt index 8e7ffe72b5fb846fb55ab8dc4809d87a40cfe06c..9ad69738eb2ac21d6ff2624f11d17a38410d5c1f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,6 +24,8 @@ message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: " "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}") message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") +message(STATUS "AR tools: ${CMAKE_AR}") + if(WIN32) set(CMAKE_SUPPRESS_REGENERATION ON) set(CMAKE_STATIC_LIBRARY_PREFIX lib) @@ -62,6 +64,7 @@ option(WITH_DISTRIBUTE "Compile with distributed support" OFF) option(WITH_PSLIB "Compile with pslib support" OFF) option(WITH_CONTRIB "Compile the third-party contributation" OFF) option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF) +# TODO(Superjomn) Remove WITH_ANAKIN option if not needed later. option(WITH_ANAKIN "Compile with Anakin library" OFF) option(ANAKIN_BUILD_FAT_BIN "Build anakin cuda fat-bin lib for all device plantform, ignored when WITH_ANAKIN=OFF" OFF) option(ANAKIN_BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform. ignored when WITH_ANAKIN=OFF" ON) @@ -188,7 +191,14 @@ include(configure) # add paddle env configuration if(WITH_GPU) include(cuda) include(tensorrt) + include(anakin_subgraph) +endif() + +if(WITH_GPU AND NOT WIN32) + message(STATUS "add dgc lib.") + include(external/dgc) endif() + if(WITH_MKL OR WITH_MKLML) include(external/anakin) elseif() diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 1304d6fe196c11a14a012b9f236b7a6682522e05..62b26b99bcbeddc91ed1bd0702b0d6aec2e674bf 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -156,7 +156,7 @@ python \ This will enable VLOG messages generated by `buddy_allocator.{h,cc}` and in the verbose range of 0 to 3, so you will see above example VLOG message, which is in level 3. This suggests that we output overall messages in lower verbose levels, so they display with higher probability. When coding C++, please follow the verbose level convention as follows: -- verbose level 1: [framework](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework) -- verbose level 3: [operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators) -- verbose level 5: [memory](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory), [platform](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform) -- verbose level 7: [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/math) +- verbose level 1: [framework](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/framework) +- verbose level 3: [operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators) +- verbose level 5: [memory](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/memory), [platform](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/platform) +- verbose level 7: [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators/math/) diff --git a/Dockerfile b/Dockerfile index fe0721e9b99b5e028df2f6228ff04cb56a567a3f..c248ac119caa1f493e4866b02551eb900d3bf391 100644 --- a/Dockerfile +++ b/Dockerfile @@ -75,8 +75,9 @@ RUN curl -s -q https://glide.sh/get | sh # and its size is only one-third of the official one. # 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle. # See https://github.com/PaddlePaddle/Paddle/issues/10129 for details.
-RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \ - tar -xz -C /usr/local && \ + +RUN wget -q https://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda.8.0.cudnn7.0.tar.gz --no-check-certificate && \ + tar -zxf TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda.8.0.cudnn7.0.tar.gz -C /usr/local && \ cp -rf /usr/local/TensorRT/include /usr && \ cp -rf /usr/local/TensorRT/lib /usr diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index 10b633a4fc1063aab5c0d34b994f9c233e228f17..df159a334e86d62e175bce3b363b74ec78c1fd64 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -179,7 +179,6 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog, else: build_strategy.reduce_strategy = fluid.BuildStrategy( ).ReduceStrategy.AllReduce - build_strategy.fuse_broadcast_op = args.fuse_broadcast_op avg_loss = train_args[0] diff --git a/cmake/anakin_subgraph.cmake b/cmake/anakin_subgraph.cmake new file mode 100644 index 0000000000000000000000000000000000000000..4a7d32a63553df31e0928e7b30249ff3e809cba1 --- /dev/null +++ b/cmake/anakin_subgraph.cmake @@ -0,0 +1,32 @@ +if(NOT WITH_GPU) + return() +endif() + +set(ANAKIN_ROOT "/usr" CACHE PATH "ANAKIN ROOT") +find_path(ANAKIN_INCLUDE_DIR anakin_config.h + PATHS ${ANAKIN_ROOT} ${ANAKIN_ROOT}/include + $ENV{ANAKIN_ROOT} $ENV{ANAKIN_ROOT}/include + NO_DEFAULT_PATH +) + +find_library(ANAKIN_LIBRARY NAMES libanakin_saber_common.so libanakin.so + PATHS ${ANAKIN_ROOT} + $ENV{ANAKIN_ROOT} $ENV{ANAKIN_ROOT}/lib + NO_DEFAULT_PATH + DOC "Path to ANAKIN library.") + +if(ANAKIN_INCLUDE_DIR AND ANAKIN_LIBRARY) + if(WITH_DSO) + set(ANAKIN_FOUND ON) + endif(WITH_DSO) +else() + set(ANAKIN_FOUND OFF) +endif() + +if(ANAKIN_FOUND) + message(STATUS "Current ANAKIN header is ${ANAKIN_INCLUDE_DIR}/anakin_config.h. ") + include_directories(${ANAKIN_ROOT}/include) + include_directories(${ANAKIN_ROOT}/include/saber) + link_directories(${ANAKIN_ROOT}) + add_definitions(-DPADDLE_WITH_ANAKIN) +endif() diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake index fc204dc9193bb28b654936048dd61a9b461abb2f..ba8b5fc6c838b221fcfb559f1f01051fc09072a4 100644 --- a/cmake/external/boost.cmake +++ b/cmake/external/boost.cmake @@ -24,7 +24,7 @@ set(BOOST_PROJECT "extern_boost") # So we use 1.41.0 here. set(BOOST_VER "1.41.0") set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE) -set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE) +set(BOOST_URL "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE) MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}") diff --git a/cmake/external/dgc.cmake b/cmake/external/dgc.cmake new file mode 100644 index 0000000000000000000000000000000000000000..199ca88b47754638d5e93043e078d552261dc088 --- /dev/null +++ b/cmake/external/dgc.cmake @@ -0,0 +1,42 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +INCLUDE(ExternalProject) + +SET(DGC_SOURCES_DIR "${THIRD_PARTY_PATH}/dgc") +SET(DGC_INSTALL_DIR "${THIRD_PARTY_PATH}/install/dgc") +SET(DGC_INCLUDE_DIR "${DGC_INSTALL_DIR}/include" CACHE PATH "dgc include directory." FORCE) +SET(DGC_LIBRARIES "${DGC_INSTALL_DIR}/lib/libdgc.a" CACHE FILEPATH "dgc library." FORCE) +INCLUDE_DIRECTORIES(${DGC_INCLUDE_DIR}) + +ExternalProject_Add( + extern_dgc + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/PaddlePaddle/Fleet" + GIT_TAG "2d04dc3800cdd0601f1b65d547dabcc60b0cf9dc" + SOURCE_DIR "${DGC_SOURCES_DIR}" + CONFIGURE_COMMAND "" + BUILD_COMMAND cd collective && make -j + INSTALL_COMMAND mkdir -p ${DGC_INSTALL_DIR}/lib/ ${DGC_INCLUDE_DIR}/dgc + && cp ${DGC_SOURCES_DIR}/collective/build/lib/libdgc.a ${DGC_LIBRARIES} + && cp ${DGC_SOURCES_DIR}/collective/build/include/dgc.h ${DGC_INCLUDE_DIR}/dgc/ + BUILD_IN_SOURCE 1 +) + +ADD_LIBRARY(dgc SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET dgc PROPERTY IMPORTED_LOCATION ${DGC_LIBRARIES}) +ADD_DEPENDENCIES(dgc extern_dgc) + +LIST(APPEND external_project_dependencies dgc) + diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake index c5754da59bf2053931be413eb10c481adecbae6b..d96da470b3cbbd8092dbf80ec5f500af9afa2ce4 100644 --- a/cmake/external/grpc.cmake +++ b/cmake/external/grpc.cmake @@ -44,7 +44,7 @@ ExternalProject_Add( # 3. keep only zlib, cares, protobuf, boringssl under "third_party", # checkout and clean other dirs under third_party # 4. remove .git, and package the directory. - URL "http://paddlepaddledeps.cdn.bcebos.com/grpc-v1.10.x.tar.gz" + URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.10.x.tar.gz" URL_MD5 "1f268a2aff6759839dccd256adcc91cf" PREFIX ${GRPC_SOURCES_DIR} UPDATE_COMMAND "" diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 94a266c50114a94d125467d55a6367a6999e3298..b1e437a9007072c82ab375bf5ed79fc7d6c80c47 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -31,9 +31,17 @@ IF(APPLE) return() ENDIF() -MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path") +# Introduce variables: +# * CMAKE_INSTALL_LIBDIR +INCLUDE(GNUInstallDirs) +SET(LIBDIR "lib") +if(CMAKE_INSTALL_LIBDIR MATCHES ".*lib64$") + SET(LIBDIR "lib64") +endif() + +MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/${LIBDIR} to runtime path") SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) -SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib") +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/${LIBDIR}") INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) # For MKLDNN code to include internal headers. @@ -58,7 +66,7 @@ ExternalProject_Add( ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS ${MKLDNN_DEPENDS} GIT_REPOSITORY "https://github.com/intel/mkl-dnn.git" - GIT_TAG "830a10059a018cd2634d94195140cf2d8790a75a" + GIT_TAG "863ff6e7042cec7d2e29897fe9f0872e0888b0fc" PREFIX ${MKLDNN_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} @@ -79,9 +87,9 @@ ExternalProject_Add( -DMKLROOT:PATH=${MKLML_ROOT} ) if(WIN32) - SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/mkldnn.lib" CACHE FILEPATH "mkldnn library." FORCE) + SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/${LIBDIR}/mkldnn.lib" CACHE FILEPATH "mkldnn library." FORCE) else(WIN32) - SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library."
FORCE) + SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/${LIBDIR}/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE) endif(WIN32) ADD_LIBRARY(shared_mkldnn SHARED IMPORTED GLOBAL) @@ -101,7 +109,7 @@ ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT}) # copy the real so.0 lib to install dir # it can be directly contained in wheel or capi if(WIN32) - SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/lib/mkldnn.dll) + SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/bin/mkldnn.dll) else(WIN32) SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0) ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB} diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index ae2679db4aed7a77ad407f881c4482fd3914ac27..142fce816de4f06aa0a36b91e3e4ecb962a8dc2a 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -34,7 +34,7 @@ SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib") SET(TIME_VERSION "2019.0.1.20181227") IF(WIN32) SET(MKLML_VER "mklml_win_${TIME_VERSION}" CACHE STRING "" FORCE) - SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE) + SET(MKLML_URL "https://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE) SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib) SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib) SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll) @@ -43,7 +43,7 @@ ELSE() #TODO(intel-huying): # Now enable Erf function in mklml library temporarily, it will be updated as offical version later. SET(MKLML_VER "Glibc225_vsErf_mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE) - SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE) + SET(MKLML_URL "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE) SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so) SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) diff --git a/cmake/external/ngraph.cmake b/cmake/external/ngraph.cmake index e7fb69dbbc872c813b2eba16a5b1098eebfeedd8..23998b497e7a796b5487a287163f98a28e8d63d7 100644 --- a/cmake/external/ngraph.cmake +++ b/cmake/external/ngraph.cmake @@ -57,20 +57,25 @@ SET(NGRAPH_TBB_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME}) ExternalProject_Add( ${NGRAPH_PROJECT} ${EXTERNAL_PROJECT_LOG_ARGS} - DEPENDS ${MKLDNN_PROJECT} ${MKLML_PROJECT} - GIT_REPOSITORY ${NGRAPH_GIT_REPO} - GIT_TAG ${NGRAPH_GIT_TAG} - PREFIX ${NGRAPH_SOURCES_DIR} - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${NGRAPH_INSTALL_DIR} - CMAKE_ARGS -DNGRAPH_UNIT_TEST_ENABLE=FALSE - CMAKE_ARGS -DNGRAPH_TOOLS_ENABLE=FALSE - CMAKE_ARGS -DNGRAPH_INTERPRETER_ENABLE=FALSE - CMAKE_ARGS -DNGRAPH_DEX_ONLY=TRUE - CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} - CMAKE_ARGS -DMKLDNN_INCLUDE_DIR=${MKLDNN_INC_DIR} - CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR} - CMAKE_ARGS -DMKLML_LIB_DIR=${MKLML_INSTALL_DIR}/lib + DEPENDS ${MKLDNN_PROJECT} ${MKLML_PROJECT} + GIT_REPOSITORY ${NGRAPH_GIT_REPO} + GIT_TAG ${NGRAPH_GIT_TAG} + PREFIX ${NGRAPH_SOURCES_DIR} + UPDATE_COMMAND "" + CMAKE_GENERATOR ${CMAKE_GENERATOR} + CMAKE_GENERATOR_PLATFORM ${CMAKE_GENERATOR_PLATFORM} + CMAKE_GENERATOR_TOOLSET ${CMAKE_GENERATOR_TOOLSET} + CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${NGRAPH_INSTALL_DIR} + CMAKE_ARGS -DNGRAPH_UNIT_TEST_ENABLE=FALSE + CMAKE_ARGS -DNGRAPH_TOOLS_ENABLE=FALSE + CMAKE_ARGS -DNGRAPH_INTERPRETER_ENABLE=FALSE + CMAKE_ARGS 
-DNGRAPH_DEX_ONLY=TRUE + CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + CMAKE_ARGS -DMKLDNN_INCLUDE_DIR=${MKLDNN_INC_DIR} + CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR} + CMAKE_ARGS -DMKLML_LIB_DIR=${MKLML_INSTALL_DIR}/lib ) add_dependencies(ngraph ${NGRAPH_PROJECT}) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index a7dce4dfdb530b13bea9df128694f0946714ccff..b7c32f80db0dcb826f3f67ffb55da1c715785add 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -131,6 +131,15 @@ elseif (NOT CBLAS_FOUND OR WIN32) ) endif () +if (WITH_GPU AND NOT WIN32) + set(dgc_dir "${FLUID_INSTALL_DIR}/third_party/install/dgc") + copy(dgc_lib + SRCS ${DGC_INSTALL_DIR}/lib ${DGC_INSTALL_DIR}/include + DSTS ${dgc_dir} ${dgc_dir} + DEPS dgc) +endif() + + if (WITH_MKLDNN) set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mkldnn") copy(mkldnn_lib diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 11a5b1b4554e7899c3ee7092a9295234743750d7..c17e718f4279f24c85db8be1177e5b5e82b13e08 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -110,7 +110,7 @@ function(op_library TARGET) # Define operators that don't need pybind here. foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op" -"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op") +"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op" "dgc_op") if ("${TARGET}" STREQUAL "${manual_pybind_op}") set(pybind_flag 1) endif() diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake index 891ff222633741f9894c2fdb6c0096a48f8a35e1..3bf12094e4c32e69f908cbe6cefc7871fc9bb568 100644 --- a/cmake/tensorrt.cmake +++ b/cmake/tensorrt.cmake @@ -33,5 +33,6 @@ if(TENSORRT_FOUND) message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. " "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ") include_directories(${TENSORRT_INCLUDE_DIR}) + link_directories(${TENSORRT_LIBRARY}) add_definitions(-DPADDLE_WITH_TENSORRT) endif() diff --git a/paddle/contrib/float16/README.md b/paddle/contrib/float16/README.md index 58b4a50666bfb622af8acbce29355f2a4a870a82..a1f8cb42451dd5e84c97d6830216d284cc8bd819 100644 --- a/paddle/contrib/float16/README.md +++ b/paddle/contrib/float16/README.md @@ -5,13 +5,13 @@ Kexin Zhao ## Introduction Deep learning is usually a two-stage work: training and inference. The training stage estimates model parameters (weights) from data. The inference stage loads the weights and uses them to interpret inputs. Typically, weights are 32-bit float values (float32). Some new devices, including NVIDIA Volta GPUs, support higher speed computation using 16-bit float values (float16). -This article explains our efforts with PaddlePaddle to train using float32 and to inference using float16. We describe a [*transpiler*](https://github.com/PaddlePaddle/Paddle/blob/a4d3de0071e1f3912230c3ab3f9ac74cf06b093a/doc/fluid/design/motivation/fluid_compiler.md), which converts a PaddlePaddle Fluid model, which, to be precise, should be called a [Fluid *program*](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md), into the inference program, and converts the weights from float32 into float16. +This article explains our efforts with PaddlePaddle to train using float32 and to inference using float16. 
We describe a [*transpiler*](https://github.com/PaddlePaddle/Paddle/blob/a4d3de0071e1f3912230c3ab3f9ac74cf06b093a/doc/fluid/design/motivation/fluid_compiler.md) that converts a PaddlePaddle Fluid model (more precisely, a [Fluid *program*](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/concepts/program.md)) into the inference program and converts the weights from float32 into float16. ## What is float16? float16 (or FP16) is a half-precision floating-point format that uses 16 bits in memory to represent a value. The advantage over 32-bit single-precision floating-point format (commonly known as float or float32 data type) is that it requires half the storage and bandwidth at the expense of precision and range. Fortunately, DNN inference has a high tolerance for the loss of precision and range when using float16 to represent the weights, and the inference accuracy will only be minimally affected in most cases, which gives us the opportunity to use float16 data type to speed up the inference. -Interested readers can refer to our [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/data_type/float16.md) and [code](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/float16.h) for more details on how we implement the float16 data type. +Interested readers can refer to our [design doc](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/data_type/float16.md) and [code](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/float16.h) for more details on how we implement the float16 data type. ## Why float16? The trend in today's deep learning community is to use bigger and deeper model, which translates to larger memory footprint, higher computation demands, and as a result higher energy consumption on computing devices. The advantages of float16 over float32 are correspondingly three-fold: @@ -24,12 +24,12 @@ The trend in today's deep learning community is to use bigger and deeper model, ## Fluid implementation of float16 inference ### Overview -Fluid use [Program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#program) instead of computation graph to describe a neural network model and the optimization procedure. Fluid program is a python wrapper around a protobuf message called [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md). Similar to programming languages, the basic structure of a Fluid program is some nested [blocks](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#block), where each block consists of some [variable](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#variable) definitions and a sequence of [operators](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#operator). An [executor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/executor.md) will run a given program by sequentially executing the operators in the entrance block. +Fluid uses [Program](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/modules/python_api.md#program) instead of a computation graph to describe a neural network model and the optimization procedure.
A Fluid program is a Python wrapper around a protobuf message called [ProgramDesc](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/concepts/program.md). Similar to programming languages, the basic structure of a Fluid program is some nested [blocks](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/modules/python_api.md#block), where each block consists of some [variable](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/modules/python_api.md#variable) definitions and a sequence of [operators](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/modules/python_api.md#operator). An [executor](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/concepts/executor.md) will run a given program by sequentially executing the operators in the entrance block. ### Basic requirement When an executor runs an operator, it uses a kernel to perform computations on tensors contained in the input variables, and then writes the results to the tensors in the output variables. Each operator has multiple kernels for different combinations of data types, devices, and library types, respectively. The operator will select the appropriate kernel to run based on, among other things, the data type of the input tensors. By default, every Fluid operator has a kernel for float data type that takes float inputs and generates float outputs. -If we provide float input to the first operator in a program, then each operator will use float kernel to compute float output and send it as input to the next operator to trigger its float kernel. This chain effect will make the program run in float mode and gives us a final output of float data type. +If we provide float input to the first operator in a program, then each operator will use the float kernel to compute a float output and send it as input to the next operator to trigger its float kernel. This chain effect will make the program run in float mode and give us a final output of float data type.
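To make the chain effect concrete, here is a minimal sketch (the layer choice and shapes are illustrative, not part of this change); `fluid.layers.cast` is used to switch the data type of the chain explicitly:

```python
import paddle.fluid as fluid

# Every operator below x receives float32 tensors and dispatches its
# float32 ("float") kernel; after the cast, downstream operators receive
# float16 tensors and dispatch their float16 kernels instead.
x = fluid.layers.data(name='x', shape=[784], dtype='float32')
h = fluid.layers.fc(input=x, size=200, act='relu')   # float kernel
h_fp16 = fluid.layers.cast(h, dtype='float16')       # switch the chain to float16
y_fp16 = fluid.layers.fc(input=h_fp16, size=10)      # float16 GEMM kernel
y = fluid.layers.cast(y_fp16, dtype='float32')       # back to float32 for the caller
```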
+The float16 transpiler is implemented to fulfill the requirements mentioned above. The details of the float16 transpiler can be found [here](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/data_type/float16.md#float16-inference). ### Experiment results Simply running the following commands to reproduce the experiment results presented in this section: @@ -113,7 +113,7 @@ We repeat the test ten times and get the following results: | #10 | 62.53% | 62.48% | | average| 62.63% | 62.62% | -We can see that the accuracy of float16 inference is very close to that of float32 inference in every experiment (within 0.05% difference) and is overall 0.01% better than its float32 counterpart averaged over ten tests. +We can see that the accuracy of float16 inference is very close to that of float32 inference in every experiment (within 0.05% difference) and is overall 0.01% better than its float32 counterpart averaged over ten tests. #### Performance benchmark Currently, Fluid only supports float16 inference on NVIDIA GPUs. There is no motivation to support float16 inference on non-ARM CPUs where float16 is not natively supported, and float16 calculation will only be slower than its float32 counterpart. @@ -132,7 +132,7 @@ Average inference time for one mini-batch on Vgg16 model tested on ImageNet data |float16| 3.32 | 4.11 | 5.88 | 9.41 | 16.54 | 30.47 | 60.23 | |Speedup| 4.22 | 2.36  | 3.91 | 3.00 | 3.26  | 2.77 | 2.97 | -We can see that float16 inference provides **2x ~ 4x** speedup on different batch sizes. +We can see that float16 inference provides **2x ~ 4x** speedup on different batch sizes. Convolution operation is ususally the computational bottleneck of CNN, so we also check the average time spent on the Fluid convolution operators for one mini-batch as follows: @@ -162,7 +162,7 @@ We find that the speedup provided by float16 inference starts relatively small a We also did the same benchmark on a single NVIDIA GeForce GTX 1080 Ti GPU that does not support Tensor Core. The results show that for Vgg16, float16 inference provides consistent small speedup (around 1.15x) for all mini-batch sizes, while for Resnet50, float16 inference is slower than its float32 counterpart in small batch sizes (mb = 1 and 2) and then delivers around 1.15x speedup for all larger batch sizes. By comparing the benchmarks on 1080 Ti and V100, we find that Tensor Core, which is specialized for float16 computations, is a critical component of high performance float16 inference. -Please refer to [here](https://github.com/PaddlePaddle/Paddle/blob/develop/contrib/float16/float16_benchmark.md) for complete benchmark results. +Please refer to [here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/contrib/float16/float16_benchmark.md) for complete benchmark results. ### Summary 1. Fluid is now able to run inference in float16 mode via a float16 transpiler. We currently support CNN programs, including Vgg and Resnet, to run in float16 inference mode. 
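As a usage sketch of the transpiler workflow described in this README: the `Float16Transpiler` import path, the model directory name, and the input shape below are illustrative assumptions (based on `paddle/contrib/float16/float16_transpiler.py`), not part of this change.

```python
import numpy as np
import paddle.fluid as fluid
from float16_transpiler import Float16Transpiler  # assumed: paddle/contrib/float16

place = fluid.CUDAPlace(0)   # float16 inference currently targets NVIDIA GPUs
exe = fluid.Executor(place)
scope = fluid.core.Scope()

with fluid.scope_guard(scope):
    # Load a float32 inference program saved via fluid.io.save_inference_model.
    [program, feed_names, fetch_targets] = fluid.io.load_inference_model(
        'float32_model_dir', exe)
    # Convert the weights to float16 and insert cast operators at the
    # boundaries, so the same float32 feed/fetch interface keeps working.
    Float16Transpiler().transpile(program, place, scope=scope)
    data = np.random.rand(1, 3, 32, 32).astype('float32')
    result, = exe.run(program, feed={feed_names[0]: data},
                      fetch_list=fetch_targets)
```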
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 87eb30169a52b9c817821214e1b1708a09dc0f50..a877c066aef54ad396242ed936d9cc9065f6dda1 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -10,9 +10,12 @@ paddle.fluid.default_startup_program (ArgSpec(args=[], varargs=None, keywords=No paddle.fluid.default_main_program (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '5430f54ab4895f9f47db6bebbaf71659')) paddle.fluid.program_guard (ArgSpec(args=['main_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b54f403e57825a1592aece03afe3afb6')) paddle.fluid.name_scope (ArgSpec(args=['prefix'], varargs=None, keywords=None, defaults=(None,)), ('document', '0ef753f5cec69fef9ae6ad8b867b33a2')) +paddle.fluid.cuda_places (ArgSpec(args=['device_ids'], varargs=None, keywords=None, defaults=(None,)), ('document', '7d9a51fc9cf3c5245b5227080a8064c3')) +paddle.fluid.cpu_places (ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)), ('document', '4c0cd83f0b401fc2ff84c70974e5d210')) +paddle.fluid.cuda_pinned_places (ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd0c3ebd813c39958c92b78e3eef7e912')) paddle.fluid.Executor.__init__ (ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.Executor.close (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'f5369953dd0c443961cf79f7a00e1a03')) -paddle.fluid.Executor.run (ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)), ('document', 'aba8093edebf2d5c869b735b92811e45')) +paddle.fluid.Executor.run (ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)), ('document', 'f482e93b38b4018796969a2e1dde479d')) paddle.fluid.global_scope (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'e148d3ab1ed8edf3e928212a375959c0')) paddle.fluid.scope_guard (ArgSpec(args=['scope'], varargs=None, keywords=None, defaults=None), ('document', 'b94d1f6bcc29c4fb58fc0058561250c2')) paddle.fluid.DistributeTranspiler.__init__ (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) @@ -44,7 +47,7 @@ paddle.fluid.AsyncExecutor.run (ArgSpec(args=['self', 'program', 'data_feed', 'f paddle.fluid.AsyncExecutor.save_model (ArgSpec(args=['self', 'save_path'], varargs=None, keywords=None, defaults=None), ('document', 'c8ac0dfcb3b187aba25d03af7fea56b2')) paddle.fluid.AsyncExecutor.stop (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '5f23d043607bb5d55e466ec3f578e093')) paddle.fluid.CompiledProgram.__init__ (ArgSpec(args=['self', 'program_or_graph'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.CompiledProgram.with_data_parallel (ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'e1af7fd53cf868554f312779fc803864')) +paddle.fluid.CompiledProgram.with_data_parallel 
(ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from', 'places'], varargs=None, keywords=None, defaults=(None, None, None, None, None)), ('document', 'a8c7793803cf976680d9478e378fa356')) paddle.fluid.CompiledProgram.with_inference_optimize (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=None), ('document', '9e5b009d850191a010e859189c127fd8')) paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None @@ -56,8 +59,14 @@ paddle.fluid.io.save_persistables (ArgSpec(args=['executor', 'dirname', 'main_pr paddle.fluid.io.load_vars (ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '0a5308f496632ab1ec3ba1f1377e6f95')) paddle.fluid.io.load_params (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '41779819cef32f2246e83aebc5a002e2')) paddle.fluid.io.load_persistables (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '28df5bfe26ca7a077f91156abb0fe6d2')) -paddle.fluid.io.save_inference_model (ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True)), ('document', '582d87b8df75a5a639a107db8ff86f9c')) +paddle.fluid.io.save_inference_model (ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True)), ('document', '70f4f53f13572436ac72d1c8b5efeb9d')) paddle.fluid.io.load_inference_model (ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename', 'pserver_endpoints'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '7a5255386075dac3c75b7058254fcdcb')) +paddle.fluid.io.PyReader.__init__ (ArgSpec(args=['self', 'feed_list', 'capacity', 'use_double_buffer', 'iterable'], varargs=None, keywords=None, defaults=(True, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.io.PyReader.decorate_batch_generator (ArgSpec(args=['self', 'reader', 'places'], varargs=None, keywords=None, defaults=(None,)), ('document', 'a3fefec8bacd6ce83f49906a9d05e779')) +paddle.fluid.io.PyReader.decorate_sample_generator (ArgSpec(args=['self', 'sample_generator', 'batch_size', 'drop_last', 'places'], varargs=None, keywords=None, defaults=(True, None)), ('document', '7abd9cf7d695bab5bb6cf7ded5903cb2')) +paddle.fluid.io.PyReader.decorate_sample_list_generator (ArgSpec(args=['self', 'reader', 'places'], varargs=None, keywords=None, defaults=(None,)), ('document', 'faef298f73e91aedcfaf5d184f3109b7')) +paddle.fluid.io.PyReader.reset (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'ff1cc1e2beb8824d453656c72c28ddfb')) +paddle.fluid.io.PyReader.start (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'b7ea0a548991924e4cfe61a577b8e56d')) paddle.fluid.initializer.ConstantInitializer.__init__ (ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, 
defaults=(0.0, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.initializer.UniformInitializer.__init__ (ArgSpec(args=['self', 'low', 'high', 'seed'], varargs=None, keywords=None, defaults=(-1.0, 1.0, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.initializer.NormalInitializer.__init__ (ArgSpec(args=['self', 'loc', 'scale', 'seed'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) @@ -68,7 +77,7 @@ paddle.fluid.initializer.MSRAInitializer.__init__ (ArgSpec(args=['self', 'unifor paddle.fluid.initializer.force_init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '6d0f3e22c90d9d500d36ff57daf056ee')) paddle.fluid.initializer.init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'a6d7011ca3d8c0d454dac3a56eae0c29')) paddle.fluid.initializer.NumpyArrayInitializer.__init__ (ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.layers.fc (ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None)), ('document', '1929058262994f212620599c63aea6bd')) +paddle.fluid.layers.fc (ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None)), ('document', '424e898365195e3ccbc2e7dc8b63605e')) paddle.fluid.layers.embedding (ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')), ('document', '89c2c55a0b0656b106064048e068e77a')) paddle.fluid.layers.dynamic_lstm (ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None)), ('document', 'dfbb624f85015df29e994ca6999e8ff6')) paddle.fluid.layers.dynamic_lstmp (ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name', 'h_0', 'c_0', 'cell_clip', 'proj_clip'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None, None, None, None, None)), ('document', 'b4b608b986eb9617aa0525e1be21d32d')) @@ -91,7 +100,7 @@ paddle.fluid.layers.pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'po paddle.fluid.layers.pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)), ('document', '043de7333b79ee0ac55053c14ed81625')) paddle.fluid.layers.adaptive_pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '859b887174d06f361658f69cb7c06d95')) paddle.fluid.layers.adaptive_pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', 
'120f4323a3d7ed9c0916f15a59f0e497')) -paddle.fluid.layers.batch_norm (ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)), ('document', 'c527b71b8a4c60dca8df8a745c2b598d')) +paddle.fluid.layers.batch_norm (ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)), ('document', '320c6973b02ea179fa89fecc80796464')) paddle.fluid.layers.data_norm (ArgSpec(args=['input', 'act', 'epsilon', 'param_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var'], varargs=None, keywords=None, defaults=(None, 1e-05, None, 'NCHW', False, None, None, None, False)), ('document', 'e45e09e65a2658e07cad987222f0d9ab')) paddle.fluid.layers.beam_search_decode (ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b0b8d53821716cd50c42e09b593f3feb')) paddle.fluid.layers.conv2d_transpose (ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)), ('document', '03993955ab1e6d3044c44e6f17fc85e9')) @@ -109,7 +118,7 @@ paddle.fluid.layers.reduce_prod (ArgSpec(args=['input', 'dim', 'keep_dim', 'name paddle.fluid.layers.sequence_first_step (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', '2b290d3d77882bfe9bb8d331cac8cdd3')) paddle.fluid.layers.sequence_last_step (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', 'c16a892f44f7fe71bfa5afc32d3f34ce')) paddle.fluid.layers.sequence_slice (ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'fdcea0e8b5bc7d8d4b1b072c521014e6')) -paddle.fluid.layers.dropout (ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name', 'dropout_implementation'], varargs=None, keywords=None, defaults=(False, None, None, 'downgrade_in_infer')), ('document', 'dc7042734c6d8b8ce97321f017f01d6f')) +paddle.fluid.layers.dropout (ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name', 'dropout_implementation'], varargs=None, keywords=None, defaults=(False, None, None, 'downgrade_in_infer')), ('document', 'f1dd22f7351f7f9853212958e0d8aa7a')) paddle.fluid.layers.split (ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '652625345c2acb900029c78cc75f8aa6')) paddle.fluid.layers.ctc_greedy_decoder (ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ebbf2adbd79683dc93db03454dfa18c2')) paddle.fluid.layers.edit_distance (ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens'], varargs=None, keywords=None, defaults=(True, None)), ('document', '97f0262f97602644c83142789d784571')) @@ -144,7 
+153,7 @@ paddle.fluid.layers.label_smooth (ArgSpec(args=['label', 'prior_dist', 'epsilon' paddle.fluid.layers.roi_pool (ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0)), ('document', 'c317aa595deb31649083c8faa91cdb97')) paddle.fluid.layers.roi_align (ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None)), ('document', '12c5bbb8b38c42e623fbc47611d766e1')) paddle.fluid.layers.dice_loss (ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,)), ('document', '1ba0508d573f65feecf3564dce22aa1d')) -paddle.fluid.layers.image_resize (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None, True, 1)), ('document', 'b3ecb819454832885c1f0f3ab9a5b938')) +paddle.fluid.layers.image_resize (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None, True, 1)), ('document', '7a1966d7c3a48f1fc0881cdaf5d83b0b')) paddle.fluid.layers.image_resize_short (ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',)), ('document', '06211aefc50c5a3e940d7204d859cdf7')) paddle.fluid.layers.resize_bilinear (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, None, True, 1)), ('document', 'e4fb4ed511b2293b8f04f7e872afbfd7')) paddle.fluid.layers.resize_nearest (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners'], varargs=None, keywords=None, defaults=(None, None, None, None, True)), ('document', '735fa9758a6d7ff3b47d7b827f961c1d')) @@ -202,10 +211,10 @@ paddle.fluid.layers.mean (ArgSpec(args=['x', 'name'], varargs=None, keywords=Non paddle.fluid.layers.mul (ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None)), ('document', 'ccd37fa6b53f074adbfb732d738c4c2d')) paddle.fluid.layers.sigmoid_cross_entropy_with_logits (ArgSpec(args=['x', 'label', 'ignore_index', 'name', 'normalize'], varargs=None, keywords=None, defaults=(-100, None, False)), ('document', '180c284317ea45ef89a460d8d79c0b72')) paddle.fluid.layers.maxout (ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '891870d069a6aea746d34cc53b61690c')) -paddle.fluid.layers.space_to_depth (ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5f207ae10589ebe38a63575ef6ff8e1e')) +paddle.fluid.layers.space_to_depth (ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'a9221eaef53884a00654e028551b78e2')) paddle.fluid.layers.affine_grid (ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '51def402b8910e163cbace9d0c0526ed')) paddle.fluid.layers.sequence_reverse (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '77a6d80aa5551ca70324fc975c44507f')) -paddle.fluid.layers.affine_channel (ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None)), 
('document', '2f46f1ff39a13ab00857e7b9f44b2fa7')) +paddle.fluid.layers.affine_channel (ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name', 'act'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None, None)), ('document', 'ab84fdc6dc60f3ad9aa397e6007e3bf9')) paddle.fluid.layers.similarity_focus (ArgSpec(args=['input', 'axis', 'indexes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '70e3b5182a18b40b47ecabd7c8490a35')) paddle.fluid.layers.hash (ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', '9bb77f8dc002dd2ce75d4769eaaf5007')) paddle.fluid.layers.grid_sampler (ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd256cba1c41a5ed92ce3f31e24a2ca6d')) @@ -222,6 +231,8 @@ paddle.fluid.layers.psroi_pool (ArgSpec(args=['input', 'rois', 'output_channels' paddle.fluid.layers.teacher_student_sigmoid_loss (ArgSpec(args=['input', 'label', 'soft_max_up_bound', 'soft_max_lower_bound'], varargs=None, keywords=None, defaults=(15.0, -15.0)), ('document', '2f6ff96864054a31aa4bb659c6722c99')) paddle.fluid.layers.huber_loss (ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None), ('document', '431a4301c35032166ec029f7432c80a7')) paddle.fluid.layers.tree_conv (ArgSpec(args=['nodes_vector', 'edge_set', 'output_size', 'num_filters', 'max_depth', 'act', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 2, 'tanh', None, None, None)), ('document', '34ea12ac9f10a65dccbc50100d12e607')) +paddle.fluid.layers.npair_loss (ArgSpec(args=['anchor', 'positive', 'labels', 'l2_reg'], varargs=None, keywords=None, defaults=(0.002,)), ('document', '46994d10276dd4cb803b4062b5d14329')) +paddle.fluid.layers.fsp_matrix (ArgSpec(args=['x', 'y'], varargs=None, keywords=None, defaults=None), ('document', 'b76ccca3735bea4a58a0dbf0d77c5393')) paddle.fluid.layers.data (ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)), ('document', '33bbd42027d872b3818b3d64ec52e139')) paddle.fluid.layers.open_files (ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)), ('document', 'b1ae2e1cc0750e58726374061ea90ecc')) paddle.fluid.layers.read_file (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', 'b0a1c2fc51c27a106da28f3308c41f5e')) @@ -229,7 +240,7 @@ paddle.fluid.layers.shuffle (ArgSpec(args=['reader', 'buffer_size'], varargs=Non paddle.fluid.layers.batch (ArgSpec(args=['reader', 'batch_size'], varargs=None, keywords=None, defaults=None), ('document', 'f563d376d35e1a4c4db100fd11b381a0')) paddle.fluid.layers.double_buffer (ArgSpec(args=['reader', 'place', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '07e5b796674796eb1ef3fee9c10d24e3')) paddle.fluid.layers.random_data_generator (ArgSpec(args=['low', 'high', 'shapes', 'lod_levels', 'for_parallel'], varargs=None, keywords=None, defaults=(True,)), ('document', '9b7f0f86ec24bbc97643cadcb6499cff')) -paddle.fluid.layers.py_reader (ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '13dabc57863f62ab3141586784ee356b')) +paddle.fluid.layers.py_reader (ArgSpec(args=['capacity', 
'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '4357643685cfd65454ba5a15f0151709')) paddle.fluid.layers.create_py_reader_by_data (ArgSpec(args=['capacity', 'feed_list', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, True)), ('document', '350f74d93fab9adb2ac4950f1c26416b')) paddle.fluid.layers.Preprocessor.__init__ (ArgSpec(args=['self', 'reader', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.layers.Preprocessor.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) @@ -239,7 +250,7 @@ paddle.fluid.layers.load (ArgSpec(args=['out', 'file_path', 'load_as_fp16'], var paddle.fluid.layers.create_tensor (ArgSpec(args=['dtype', 'name', 'persistable'], varargs=None, keywords=None, defaults=(None, False)), ('document', 'c0c3d0194f83fff8ea99ce0820657dae')) paddle.fluid.layers.create_parameter (ArgSpec(args=['shape', 'dtype', 'name', 'attr', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(None, None, False, None)), ('document', 'd62b866c899bc1fedb5385f95b88e1f8')) paddle.fluid.layers.create_global_var (ArgSpec(args=['shape', 'value', 'dtype', 'persistable', 'force_cpu', 'name'], varargs=None, keywords=None, defaults=(False, False, None)), ('document', 'ab914fac893607e29ac6e52bbdbea1a4')) -paddle.fluid.layers.cast (ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '60cb8f843d625abf33f8bf12455b8f99')) +paddle.fluid.layers.cast (ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '992eb42590fc1c380841a6db72ce78b3')) paddle.fluid.layers.tensor_array_to_tensor (ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', 'b12717d3d4567e6119589f7f655b0cbb')) paddle.fluid.layers.concat (ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(0, None)), ('document', 'b19b79be4f05e85d1d6cec642c9fb535')) paddle.fluid.layers.sums (ArgSpec(args=['input', 'out'], varargs=None, keywords=None, defaults=(None,)), ('document', '42912092418620b4be07f36af31e7816')) @@ -255,6 +266,7 @@ paddle.fluid.layers.reverse (ArgSpec(args=['x', 'axis'], varargs=None, keywords= paddle.fluid.layers.has_inf (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '8f8c0306117ea441f20dcbbdba1f0ecc')) paddle.fluid.layers.has_nan (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '2e53e83127dbfd86e7098bdfe9a549e8')) paddle.fluid.layers.isfinite (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '0a437011c3906079fd8947ed3e52d292')) +paddle.fluid.layers.range (ArgSpec(args=['start', 'end', 'step', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '2ec937ede953ded2fdff2675883900bb')) paddle.fluid.layers.While.__init__ (ArgSpec(args=['self', 'cond', 'is_test', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.layers.While.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.layers.Switch.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) @@ -277,7 +289,7 @@ 
paddle.fluid.layers.DynamicRNN.block (ArgSpec(args=['self'], varargs=None, keywo paddle.fluid.layers.DynamicRNN.memory (ArgSpec(args=['self', 'init', 'shape', 'value', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, False, 'float32')), ('document', 'b9174d4e91505b0c8ecc193eb51e248d')) paddle.fluid.layers.DynamicRNN.output (ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None), ('document', 'b439a176a3328de8a75bdc5c08eece4a')) paddle.fluid.layers.DynamicRNN.static_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', 'f29ad2478b6b2ad4f413d2936a331ea0')) -paddle.fluid.layers.DynamicRNN.step_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', '169d694d2224f62b4f3afdc3dbc19e95')) +paddle.fluid.layers.DynamicRNN.step_input (ArgSpec(args=['self', 'x', 'level'], varargs=None, keywords=None, defaults=(0,)), ('document', '7568c5ac7622a10288d3307a94134655')) paddle.fluid.layers.DynamicRNN.update_memory (ArgSpec(args=['self', 'ex_mem', 'new_mem'], varargs=None, keywords=None, defaults=None), ('document', '5d83987da13b98363d6a807a52d8024f')) paddle.fluid.layers.StaticRNN.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.layers.StaticRNN.memory (ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1)), ('document', 'c24e368e23afac1ed91a78a639d7a9c7')) @@ -293,13 +305,16 @@ paddle.fluid.layers.sigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords= paddle.fluid.layers.logsigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '81ccb7acafd06c7728e11581f5d342e3')) paddle.fluid.layers.exp (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e6b3e769413d96aab4176f96db25984b')) paddle.fluid.layers.tanh (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e9d586a0b5bd05f67ee78048f9d503b6')) +paddle.fluid.layers.atan (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '3a46e0b5f9ce82348406478e610f14c9')) paddle.fluid.layers.tanh_shrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '1e521554b9fdda9061ec6d306f0709b7')) paddle.fluid.layers.softshrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '9eef31597bbafa2bd49691e072296e13')) -paddle.fluid.layers.sqrt (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '072a8541e0f632366bba10f67cb0db27')) +paddle.fluid.layers.sqrt (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e9e27491c39ac74d0b1ffe506aec0ebb')) paddle.fluid.layers.abs (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '64650ac42cf82e9920cb0b172b1d29fd')) paddle.fluid.layers.ceil (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c75d67dc5fe28f68e4cfffead4f698ad')) paddle.fluid.layers.floor (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '647b16c5da5ef909649ae02abb434973')) paddle.fluid.layers.cos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 
'485f2686bcc2fe37a4bd893769c8a3e2')) +paddle.fluid.layers.acos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '920a47734482276c069ba24c61c26b25')) +paddle.fluid.layers.asin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'cf4ee2c9b9d7293556f8c5173dfb5d2c')) paddle.fluid.layers.sin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '01f1766aa76eff1df30147505b59f7c4')) paddle.fluid.layers.round (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b47f5da13913d3e56bdb1e612a73f3f2')) paddle.fluid.layers.reciprocal (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'cc6ac2f14f03c52aaa83a59bf83b8d26')) @@ -327,9 +342,11 @@ paddle.fluid.layers.generate_mask_labels (ArgSpec(args=['im_info', 'gt_classes', paddle.fluid.layers.iou_similarity (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '587845f60c5d97ffdf2dfd21da52eca1')) paddle.fluid.layers.box_coder (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0)), ('document', '032d0f4b7d8f6235ee5d91e473344f0e')) paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0e5ac2507723a0b5adec473f9556799b')) -paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '991e934c3e09abf0edec7c9c978b4691')) +paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'gtscore', 'use_label_smooth', 'name'], varargs=None, keywords=None, defaults=(None, True, None)), ('document', '57fa96922e42db8f064c3fb77f2255e8')) +paddle.fluid.layers.yolo_box (ArgSpec(args=['x', 'img_size', 'anchors', 'class_num', 'conf_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5566169a5ab993d177792c023c7fb340')) paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '397e9e02b451d99c56e20f268fa03f2e')) paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'ca7d1107b6c5d2d6d8221039a220fde0')) +paddle.fluid.layers.distribute_fpn_proposals (ArgSpec(args=['fpn_rois', 'min_level', 'max_level', 'refer_level', 'refer_scale', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '7bb011ec26bace2bc23235aa4a17647d')) paddle.fluid.layers.box_decoder_and_assign (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'box_score', 'box_clip', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '005a5ae47d6c8fff721931d69d072b9f')) paddle.fluid.layers.accuracy (ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)), ('document', '9808534c12c5e739a10f73ebb0b4eafd')) paddle.fluid.layers.auc (ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, 
keywords=None, defaults=('ROC', 4095, 1, 1)), ('document', 'e0e95334fce92d16c2d9db6e7caffc47')) @@ -363,7 +380,7 @@ paddle.fluid.contrib.BeamSearchDecoder.read_array (ArgSpec(args=['self', 'init', paddle.fluid.contrib.BeamSearchDecoder.update_array (ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None), ('document', '5754e9b3212b7c09497151516a0de5a7')) paddle.fluid.contrib.memory_usage (ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None), ('document', '8fcb2f93bb743693baa8d4860a5ccc47')) paddle.fluid.contrib.op_freq_statistic (ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None), ('document', '4d43687113c4bf5b29d15aee2f4e4afa')) -paddle.fluid.contrib.QuantizeTranspiler.__init__ (ArgSpec(args=['self', 'weight_bits', 'activation_bits', 'activation_quantize_type', 'weight_quantize_type', 'window_size'], varargs=None, keywords=None, defaults=(8, 8, 'abs_max', 'abs_max', 10000)), ('document', '14b39f1fcd5667ff556b1aad94357d1d')) +paddle.fluid.contrib.QuantizeTranspiler.__init__ (ArgSpec(args=['self', 'weight_bits', 'activation_bits', 'activation_quantize_type', 'weight_quantize_type', 'window_size', 'moving_rate'], varargs=None, keywords=None, defaults=(8, 8, 'abs_max', 'abs_max', 10000, 0.9)), ('document', '14b39f1fcd5667ff556b1aad94357d1d')) paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 (ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.contrib.QuantizeTranspiler.freeze_program (ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None)), ('document', '909675a1ab055c69b436a7893fcae4fd')) paddle.fluid.contrib.QuantizeTranspiler.training_transpile (ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6dd9909f10b283ba2892a99058a72884')) @@ -371,26 +388,12 @@ paddle.fluid.contrib.Calibrator.__init__ (ArgSpec(args=['self'], varargs='args', paddle.fluid.contrib.Calibrator.sample_data (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '3b8c85ca1e2cf753cc8c90a6c6992958')) paddle.fluid.contrib.Calibrator.save_int8_model (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.contrib.reader.ctr_reader.ctr_reader (ArgSpec(args=['feed_dict', 'file_type', 'file_format', 'dense_slot_index', 'sparse_slot_index', 'capacity', 'thread_num', 'batch_size', 'file_list', 'slots', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b2ebf3de2a6ef1af2c3b88d2db7591ab')) -paddle.fluid.contrib.build_compressor (ArgSpec(args=['place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'config'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.contrib.CompressPass.__init__ (ArgSpec(args=['self', 'place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'program_exe'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.contrib.CompressPass.add_strategy (ArgSpec(args=['self', 'strategy'], varargs=None, keywords=None, defaults=None), ('document', '3bf6010b6f47d3c86df0ec8957be95e0')) -paddle.fluid.contrib.CompressPass.apply (ArgSpec(args=['self', 'graph'], varargs=None, 
keywords=None, defaults=None), ('document', 'a92bf85d4b59bd4f2ac1706d7c4899a6')) -paddle.fluid.contrib.ImitationGraph.__init__ (ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.contrib.ImitationGraph.all_parameters (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.contrib.SensitivePruneStrategy.__init__ (ArgSpec(args=['self', 'pruner', 'start_epoch', 'end_epoch', 'delta_rate', 'acc_loss_threshold', 'sensitivities'], varargs=None, keywords=None, defaults=(None, 0, 10, 0.2, 0.2, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.contrib.SensitivePruneStrategy.on_batch_begin (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.contrib.SensitivePruneStrategy.on_batch_end (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.contrib.SensitivePruneStrategy.on_compress_begin (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.contrib.SensitivePruneStrategy.on_compress_end (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_begin (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_end (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.contrib.MagnitudePruner.__init__ (ArgSpec(args=['self', 'threshold'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.contrib.MagnitudePruner.prune (ArgSpec(args=['self', 'param', 'threshold'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.contrib.RatioPruner.__init__ (ArgSpec(args=['self', 'ratios'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e7a81a325b296a9ca502ee5adb4fc85d')) -paddle.fluid.contrib.RatioPruner.prune (ArgSpec(args=['self', 'param', 'ratio'], varargs=None, keywords=None, defaults=(None,)), ('document', '358cbf2978c91028fb96a195a9884645')) -paddle.fluid.contrib.load_persistables_for_increment (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None), ('document', '11fbf7e8dd2289805de291b453a33ee7')) -paddle.fluid.contrib.load_persistables_for_inference (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var_name'], varargs=None, keywords=None, defaults=None), ('document', '5b5577bb3d24070da819674255d16196')) -paddle.fluid.contrib.convert_dist_to_sparse_program (ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None), ('document', '4efbd93876832d4d35497cdbc7a1e6d8')) +paddle.fluid.contrib.Compressor.__init__ (ArgSpec(args=['self', 'place', 'scope', 'train_program', 'train_reader', 'train_feed_list', 'train_fetch_list', 'eval_program', 'eval_reader', 'eval_feed_list', 'eval_fetch_list', 'teacher_programs', 'checkpoint_path', 'train_optimizer', 'distiller_optimizer'], varargs=None, 
keywords=None, defaults=(None, None, None, None, None, None, None, [], './checkpoints', None, None)), ('document', '31ae143830c9bf6b43547dd546c5ba80')) +paddle.fluid.contrib.Compressor.config (ArgSpec(args=['self', 'config_file'], varargs=None, keywords=None, defaults=None), ('document', '780d9c007276ccbb95b292400d7807b0')) +paddle.fluid.contrib.Compressor.run (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'c6e43d6a078d307672283c1f36e04fe9')) +paddle.fluid.contrib.load_persistables_for_increment (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None), ('document', '2ab36d4f7a564f5f65e455807ad06c67')) +paddle.fluid.contrib.load_persistables_for_inference (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var_name'], varargs=None, keywords=None, defaults=None), ('document', '59066bac9db0ac6ce414d05780b7333f')) +paddle.fluid.contrib.convert_dist_to_sparse_program (ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None), ('document', '74c39c595dc70d6be2f16d8e462d282b')) paddle.fluid.contrib.HDFSClient.__init__ (ArgSpec(args=['self', 'hadoop_home', 'configs'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.contrib.HDFSClient.delete (ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None), ('document', 'c3721aa2d4d9ef5a857dd47b2681c03e')) paddle.fluid.contrib.HDFSClient.download (ArgSpec(args=['self', 'hdfs_path', 'local_path', 'overwrite', 'unzip'], varargs=None, keywords=None, defaults=(False, False)), ('document', 'ca55bde92184d3fd0f9f5c963b25e634')) @@ -427,49 +430,65 @@ paddle.fluid.nets.img_conv_group (ArgSpec(args=['input', 'conv_num_filter', 'poo paddle.fluid.optimizer.SGDOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.SGDOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.SGDOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.SGDOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.SGDOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) paddle.fluid.optimizer.MomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.MomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.MomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 
'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.MomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.MomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) paddle.fluid.optimizer.AdagradOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name', 'initial_accumulator_value'], varargs=None, keywords=None, defaults=(1e-06, None, None, 0.0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.AdagradOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.AdagradOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.AdagradOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.AdagradOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) paddle.fluid.optimizer.AdamOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.AdamOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.AdamOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.AdamOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.AdamOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) paddle.fluid.optimizer.AdamaxOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.AdamaxOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.AdamaxOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.AdamaxOptimizer.get_opti_var_name_list 
(ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.AdamaxOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.DecayedAdagradOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.DecayedAdagradOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.DecayedAdagradOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) paddle.fluid.optimizer.FtrlOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.0, 0.0, -0.5, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.FtrlOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.FtrlOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.FtrlOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.FtrlOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) paddle.fluid.optimizer.RMSPropOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, 0.0, False, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.RMSPropOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.RMSPropOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.RMSPropOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 
'6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.RMSPropOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) paddle.fluid.optimizer.AdadeltaOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, 0.95, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.AdadeltaOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.AdadeltaOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.AdadeltaOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.AdadeltaOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) paddle.fluid.optimizer.ModelAverage.__init__ (ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window', 'regularization', 'name'], varargs=None, keywords=None, defaults=(10000, 10000, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.ModelAverage.apply (ArgSpec(args=['self', 'executor', 'need_restore'], varargs=None, keywords=None, defaults=(True,)), ('document', '46234a5470590feb336346f70a3db715')) paddle.fluid.optimizer.ModelAverage.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.ModelAverage.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.ModelAverage.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.ModelAverage.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) paddle.fluid.optimizer.ModelAverage.restore (ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None), ('document', '18db9c70be9c4dd466f9844457b21bfe')) paddle.fluid.optimizer.LarsMomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'lars_coeff', 'lars_weight_decay', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.0005, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.LarsMomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.LarsMomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], 
varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.LarsMomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.LarsMomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) +paddle.fluid.optimizer.DGCMomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'rampup_begin_step', 'rampup_step', 'sparsity', 'use_nesterov', 'local_grad_clip_norm', 'num_trainers', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1, [0.999], False, None, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.DGCMomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) +paddle.fluid.optimizer.DGCMomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.DGCMomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.DGCMomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) paddle.fluid.backward.append_backward (ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '1a79bd7d10ae54ca763ec81bca36ba24')) paddle.fluid.regularizer.L1DecayRegularizer.__init__ (ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.regularizer.L2DecayRegularizer.__init__ (ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) @@ -489,7 +508,7 @@ paddle.fluid.CUDAPinnedPlace.__init__ __init__(self: paddle.fluid.core.CUDAPinne paddle.fluid.ParamAttr.__init__ (ArgSpec(args=['self', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, 1.0, None, True, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.WeightNormParamAttr.__init__ (ArgSpec(args=['self', 'dim', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, None, 1.0, None, True, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.DataFeeder.__init__ (ArgSpec(args=['self', 'feed_list', 'place', 'program'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.DataFeeder.decorate_reader (ArgSpec(args=['self', 'reader', 'multi_devices', 'num_places', 'drop_last'], varargs=None, keywords=None, defaults=(None, True)), ('document', '0eed2f198dc73c08a41b61edbc755753')) 
+paddle.fluid.DataFeeder.decorate_reader (ArgSpec(args=['self', 'reader', 'multi_devices', 'num_places', 'drop_last'], varargs=None, keywords=None, defaults=(None, True)), ('document', 'f8f3df23c5633c614db781a91b81fb62')) paddle.fluid.DataFeeder.feed (ArgSpec(args=['self', 'iterable'], varargs=None, keywords=None, defaults=None), ('document', '459e316301279dfd82001b46f0b8ffca')) paddle.fluid.DataFeeder.feed_parallel (ArgSpec(args=['self', 'iterable', 'num_places'], varargs=None, keywords=None, defaults=(None,)), ('document', '543863d1f9d4853758adb613b8659e85')) paddle.fluid.clip.ErrorClipByValue.__init__ (ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) @@ -507,17 +526,19 @@ paddle.fluid.unique_name.guard (ArgSpec(args=['new_generator'], varargs=None, ke paddle.fluid.recordio_writer.convert_reader_to_recordio_file (ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)), ('document', '65c7523e86f0c50bb729b01667f36310')) paddle.fluid.recordio_writer.convert_reader_to_recordio_files (ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)), ('document', 'bc643f0f5f1b9db57ff0d8a57d379bd7')) paddle.fluid.Scope Scope() -> paddle.fluid.core._Scope +paddle.fluid.install_check.run_check (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '66b7c84a17ed32fec2df9628367be2b9')) +paddle.reader.cache (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', '1676886070eb607cb608f7ba47be0d3c')) paddle.reader.map_readers (ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None), ('document', '77cbadb09df588e21e5cc0819b69c87d')) paddle.reader.buffered (ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None), ('document', '0d6186f109feceb99f60ec50a0a624cb')) paddle.reader.compose (ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None), ('document', '884291104e1c3f37f33aae44b7deeb0d')) paddle.reader.chain (ArgSpec(args=[], varargs='readers', keywords=None, defaults=None), ('document', 'd22c34e379a53901ae67a6bca7f4def4')) paddle.reader.shuffle (ArgSpec(args=['reader', 'buf_size'], varargs=None, keywords=None, defaults=None), ('document', 'e42ea6fee23ce26b23cb142cd1d6522d')) paddle.reader.firstn (ArgSpec(args=['reader', 'n'], varargs=None, keywords=None, defaults=None), ('document', 'c5bb8f7dd4f917f1569a368aab5b8aad')) -paddle.reader.xmap_readers (ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,)), ('document', '283bc0b8a0e26ae186b8b9bee4aec560')) +paddle.reader.xmap_readers (ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,)), ('document', '9c804a42f8a4dbaa76b3c98e0ab7f796')) paddle.reader.PipeReader.__init__ (ArgSpec(args=['self', 'command', 'bufsize', 'file_type'], varargs=None, keywords=None, defaults=(8192, 'plain')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.reader.PipeReader.get_line (ArgSpec(args=['self', 'cut_lines', 'line_break'], varargs=None, keywords=None, defaults=(True, '\n')), ('document', '5f80a7ed70052f01665e4c74acccfa69')) +paddle.reader.PipeReader.get_line (ArgSpec(args=['self', 'cut_lines', 'line_break'], 
varargs=None, keywords=None, defaults=(True, '\n')), ('document', '9621ae612e595b6c34eb3bb5f3eb1a45')) paddle.reader.multiprocess_reader (ArgSpec(args=['readers', 'use_pipe', 'queue_size'], varargs=None, keywords=None, defaults=(True, 1000)), ('document', '7d8b3a96e592107c893d5d51ce968ba0')) paddle.reader.Fake.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.reader.creator.np_array (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '28d457fbc9a71efa4ac91a3be179cada')) -paddle.reader.creator.text_file (ArgSpec(args=['path'], varargs=None, keywords=None, defaults=None), ('document', '44fe286ab6175a5464d3a961a68c266a')) -paddle.reader.creator.recordio (ArgSpec(args=['paths', 'buf_size'], varargs=None, keywords=None, defaults=(100,)), ('document', '11b3704ea42cfd537953387a7e58dae8')) +paddle.reader.creator.text_file (ArgSpec(args=['path'], varargs=None, keywords=None, defaults=None), ('document', 'f45fcb7add066c8e042c6774fc7c3db2')) +paddle.reader.creator.recordio (ArgSpec(args=['paths', 'buf_size'], varargs=None, keywords=None, defaults=(100,)), ('document', 'b4a94ee0e2cefb495619275c2f8c61d2')) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 7ddf1ab44fe096739f4d241994e5cb686970a7c5..4d54754cec00dc435000138d4f297af243813fc3 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -38,10 +38,10 @@ if(WITH_GPU) nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context) add_dependencies(tensor tensor_util) else() - nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context ) + nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context profiler) endif(WIN32) else() - cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context ) + cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context profiler) endif() cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) @@ -63,7 +63,7 @@ cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory) nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor) -cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memory) +cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memory gflags glog) cc_library(reader SRCS reader.cc DEPS lod_tensor ddim) cc_test(reader_test SRCS reader_test.cc DEPS reader) @@ -164,6 +164,8 @@ else() set(NGRAPH_EXE_DEPS) endif() +cc_library(executor_gc_helper SRCS executor_gc_helper.cc DEPS scope proto_desc operator garbage_collector) + if(WITH_DISTRIBUTE) cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_rpc ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper ${NGRAPH_EXE_DEPS}) @@ -174,7 +176,7 @@ else() cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) endif() -target_link_libraries(executor garbage_collector) +target_link_libraries(executor while_op_helper executor_gc_helper) cc_library(parallel_executor SRCS parallel_executor.cc DEPS threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor @@ -194,6 +196,7 @@ cc_test(prune_test SRCS prune_test.cc DEPS op_info 
prune recurrent_op device_con cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry proto_desc) cc_test(inplace_op_inference_test SRCS inplace_op_inference_test.cc DEPS op_registry proto_desc op_info memory_optimize_helper) + cc_library(selected_rows SRCS selected_rows.cc DEPS tensor) cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index dc308fd2592bb158f46f6eac9dd0df25787559fe..046ec6978a84fa6eba11513860523de5a63a31d8 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -5,10 +5,12 @@ cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_h cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry) cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place operator op_registry) +cc_library(fetch_barrier_op_handle SRCS fetch_barrier_op_handle.cc DEPS framework_proto scope place operator op_registry) cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper) cc_library(multi_devices_graph_print_pass SRCS multi_devices_graph_print_pass.cc DEPS multi_devices_helper) cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc DEPS multi_devices_helper) +cc_library(alloc_continuous_space_for_grad_pass SRCS alloc_continuous_space_for_grad_pass.cc DEPS graph graph_helper) cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows) @@ -21,6 +23,8 @@ endif() if(WITH_GPU) nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory + dynload_cuda variable_visitor dgc) + nv_library(fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory dynload_cuda variable_visitor) if(WITH_DISTRIBUTE) nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope @@ -35,6 +39,8 @@ if(WITH_GPU) else() cc_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory variable_visitor) + cc_library(fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory + variable_visitor) if(WITH_DISTRIBUTE) cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim selected_rows_functor sendrecvop_rpc) @@ -46,9 +52,7 @@ else() cc_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle) endif() -cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_base scope lod_tensor) cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) -cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope) if(WITH_GPU) cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper gpu_info) @@ -61,14 +65,17 @@ cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_ cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper) cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS 
garbage_collector computation_op_handle) cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper) -cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass) +cc_library(while_op_eager_deletion_pass SRCS while_op_eager_deletion_pass.cc DEPS while_op_helper graph_helper pass computation_op_handle) +cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass while_op_eager_deletion_pass) cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view reference_count_pass_helper) cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass) cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass) cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle - scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle) + scale_loss_grad_op_handle rpc_op_handle fetch_barrier_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle fused_broadcast_op_handle) + +cc_library(fuse_all_reduce_op_pass SRCS fuse_all_reduce_op_pass.cc DEPS graph graph_helper fused_all_reduce_op_handle) set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass inplace_op_pass) if (WITH_GPU) @@ -97,5 +104,5 @@ cc_library(build_strategy SRCS build_strategy.cc DEPS graph_viz_pass multi_devices_graph_pass multi_devices_graph_print_pass multi_devices_graph_check_pass fuse_elewise_add_act_pass multi_batch_merge_pass - fuse_relu_depthwise_conv_pass - memory_optimize_pass lock_free_optimize_pass) + fuse_relu_depthwise_conv_pass + memory_optimize_pass lock_free_optimize_pass alloc_continuous_space_for_grad_pass fuse_all_reduce_op_pass) diff --git a/paddle/fluid/framework/details/all_reduce_deps_pass.cc b/paddle/fluid/framework/details/all_reduce_deps_pass.cc index ff223e616f7ef0c794e72a0028c7e5bb3f234ec0..98a74d630cd4f32b064e95ddc6082e2d2ad657e1 100644 --- a/paddle/fluid/framework/details/all_reduce_deps_pass.cc +++ b/paddle/fluid/framework/details/all_reduce_deps_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. 
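+// Note: the ordering hunk below only considers backward ops. Ops without
+// the kBackward role are skipped, so all-reduce dependencies follow the
+// order in which the backward pass produces gradients.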
#include +#include #include #include @@ -52,13 +53,28 @@ std::unique_ptr AllReduceDepsPass::ApplyImpl( // Note that must assert topology sort is stable auto& ops = graph->Get>(kStaleProgramOpDescs); for (auto* op_desc : ops) { - auto outputs = op_desc->Outputs(); - for (auto& o_it : outputs) { - for (auto& v : o_it.second) { // values - vars[v] = order; + try { + bool is_bk_op = + static_cast<bool>(boost::get<int>(op_desc->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())) & + static_cast<int>(OpRole::kBackward)); + if (!is_bk_op) continue; + + auto backward_vars = + boost::get<std::vector<std::string>>(op_desc->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); + + auto outputs = op_desc->Outputs(); + for (auto& o_it : outputs) { + for (auto& v : o_it.second) { // values + vars[v] = order; + VLOG(1) << "in all_reduce_deps_pass:" << v; + } } + order++; + } catch (const boost::bad_get& e) { } - order++; } std::vector dist_ops; @@ -70,7 +86,8 @@ std::unique_ptr AllReduceDepsPass::ApplyImpl( } } - VLOG(10) << "dist_ops size:" << dist_ops.size() << std::endl; + VLOG(10) << "dist_ops size:" << dist_ops.size() + << ", outputs size:" << vars.size() << ", ops size:" << ops.size(); std::sort(dist_ops.begin(), dist_ops.end(), [&](OpHandleBase* op1, OpHandleBase* op2) { @@ -83,6 +100,10 @@ std::unique_ptr AllReduceDepsPass::ApplyImpl( auto l_it = vars.find(i0->name()); auto r_it = vars.find(i1->name()); + PADDLE_ENFORCE(l_it != vars.end() && r_it != vars.end(), + "cannot find variable %s or %s in the OpDesc order map", + i0->name(), i1->name()); + if (l_it->second < r_it->second) return true; if (l_it->second == r_it->second) { diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index c1f9c2b60c915370df7793f26fe83812a7ced96d..6e477cd2977561ddb914e4a6343f677044fad4be 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -11,12 +11,18 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License.
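+// Sparse (DGC) all-reduce overview: when is_encoded_ is set and IsEncoded()
+// observes that the DGC counter has reached the rampup step, RunImplEncoded()
+// exchanges the top-k encoded gradients via
+// paddle::communication::dgc::sparseAllGReduce; otherwise RunImplNormal()
+// performs the dense ncclAllReduce as before.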
-#include - #include "paddle/fluid/framework/details/all_reduce_op_handle.h" +#include #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/details/variable_visitor.h" +#include "paddle/fluid/framework/operator.h" + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#include "dgc/dgc.h" +#endif + +#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/profiler.h" // asynchronous nccl allreduce or synchronous issue: @@ -34,11 +40,14 @@ namespace details { AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, - const platform::NCCLContextMap *ctxs) + const platform::NCCLContextMap *ctxs, + bool is_encoded, int nranks) : OpHandleBase(node), local_scopes_(local_scopes), places_(places), - nccl_ctxs_(ctxs) { + nccl_ctxs_(ctxs), + is_encoded_(is_encoded), + nranks_(nranks) { if (nccl_ctxs_) { for (auto &p : places_) { this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p)); @@ -52,10 +61,189 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {} #endif +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +void AllReduceOpHandle::RunImplEncoded() { + platform::RecordEvent record_event(Name()); + + WaitInputVarGenerated(); + + auto in_var_handles = DynamicCast(this->Inputs()); + auto out_var_handles = DynamicCast(this->Outputs()); + PADDLE_ENFORCE_EQ( + in_var_handles.size(), places_.size(), + "The NoDummyInputSize should be equal to the number of places."); + PADDLE_ENFORCE_EQ( + in_var_handles.size(), out_var_handles.size(), + "The NoDummyInputSize and NoDummyOutputSize should be equal."); + + std::vector ins; + std::vector outs; + int k = -1; + for (size_t i = 0; i < local_scopes_.size(); ++i) { + auto &local_scope = + local_scopes_[i]->FindVar(kLocalExecScopeName)->Get(); + auto original_name = + paddle::framework::GradOriginalVarName(in_var_handles[i]->name()); + auto encode_var_name = original_name + g_dgc_encoded; + auto *in_var = local_scope->FindVar(encode_var_name); + PADDLE_ENFORCE_NOT_NULL(in_var); + auto &in = in_var->Get(); + ins.emplace_back(&in); + + auto *out = local_scope->FindVar(out_var_handles[i]->name()) + ->GetMutable(); + outs.emplace_back(out); + + if (k < 0) { + k = GetKValue(in_var_handles[i]->name()); + } + } + + PADDLE_ENFORCE(platform::is_gpu_place(ins[0]->place())); + PADDLE_ENFORCE(platform::is_gpu_place(outs[0]->place())); + PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr."); + + int dtype = -1; + size_t in_numel = 0; + size_t out_numel = 0; + PADDLE_ENFORCE(nranks_ > 1); + std::vector> all_reduce_calls; + + for (size_t i = 0; i < local_scopes_.size(); ++i) { + auto &place = places_[i]; + auto &in = *ins[i]; + void *in_tensor_buf = const_cast(in.data()); + + auto &out = *outs[i]; + float *out_tensor_buf = out.data(); + + dtype = (dtype == -1) ? platform::ToNCCLDataType(in.type()) : dtype; + in_numel = (in_numel == 0) ? static_cast(in.numel()) : in_numel; + PADDLE_ENFORCE(in_numel % 2 == 0); + PADDLE_ENFORCE(in_numel / 2 == static_cast(k)); + out_numel = (out_numel == 0) ? 
static_cast<size_t>(out.numel()) : out_numel; + + int dev_id = boost::get<platform::CUDAPlace>(place).device; + auto &nccl_ctx = nccl_ctxs_->at(dev_id); + auto stream = nccl_ctx.stream(); + auto comm = nccl_ctx.comm_; + + auto &allocator = + platform::DeviceTemporaryAllocator::Instance().Get(place, stream); + int encode_size = 2 * k * sizeof(int); + // DGC uses ncclAllGather to collect the encoded data from every rank, + // so the gather buffer needs nranks segments. + int buf_size = nranks_ * encode_size; + auto tmp_ious_data = allocator.Allocate(buf_size); + void *gather_buff = reinterpret_cast<void *>(tmp_ious_data->ptr()); + + VLOG(10) << "in_numel:" << in_numel << ", out_numel:" << out_numel + << ", nranks:" << nranks_ << ", gather_buf size:" << buf_size + << ", k:" << k << ", place:" << place << ", dtype:" << dtype; + + all_reduce_calls.emplace_back([=] { + PADDLE_ENFORCE(paddle::communication::dgc::sparseAllGReduce( + in_tensor_buf, gather_buff, k, out_tensor_buf, out_numel, comm, + stream)); + }); + } + + this->RunAndRecordEvent([&] { + if (all_reduce_calls.size() == 1UL) { + // Do not use NCCLGroup when NCCL is managed per thread per device. + all_reduce_calls[0](); + } else { + platform::NCCLGroupGuard guard; + for (auto &call : all_reduce_calls) { + call(); + } + } + }); + + if (FLAGS_sync_nccl_allreduce) { + for (auto &p : places_) { + int dev_id = boost::get<platform::CUDAPlace>(p).device; + auto &nccl_ctx = nccl_ctxs_->at(dev_id); + auto stream = nccl_ctx.stream(); + cudaError_t e_sync = cudaStreamSynchronize(stream); + if (e_sync != 0) { + LOG(FATAL) << "cudaStreamSynchronize " << cudaGetErrorString(e_sync); + } + + cudaError_t e_get = cudaGetLastError(); + if (e_get != 0) { + LOG(FATAL) << "cudaGetLastError " << cudaGetErrorString(e_get) + << " errno:" << e_get; + } + } + } +} + +int AllReduceOpHandle::GetKValue(const std::string &grad_name) { + auto original_name = paddle::framework::GradOriginalVarName(grad_name); + auto var_name = original_name + g_dgc_k; + PADDLE_ENFORCE(local_scopes_.size() > 0); + + auto *scope = local_scopes_[0]; + auto &local_scope = scope->FindVar(kLocalExecScopeName)->Get(); + auto var = local_scope->FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL(var); + auto tensor = var->Get().data(); + return *tensor; +} +#endif + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +bool AllReduceOpHandle::IsEncoded() { + if (!is_encoded_) { + return false; + } + auto counter_name = g_dgc_counter_name; + auto step_name = g_dgc_rampup_begin_step; + PADDLE_ENFORCE(local_scopes_.size() > 0); + + auto *scope = local_scopes_[0]; + auto &local_scope = scope->FindVar(kLocalExecScopeName)->Get(); + auto count_var = local_scope->FindVar(counter_name); + auto step_var = local_scope->FindVar(step_name); + if (count_var == nullptr || step_var == nullptr) { + PADDLE_THROW("cannot find counter variable %s or step variable %s", + counter_name, step_name); + } + + float count = *count_var->Get().data(); + float step = *step_var->Get().data(); + if (static_cast<int>(count) < static_cast<int>(step)) { + VLOG(10) << "in all_reduce current step:" << count + << " < rampup_begin_step:" << step + << " so sparse all-reduce is not used"; + return false; + } + + return true; +} +#else +bool AllReduceOpHandle::IsEncoded() { return false; } +#endif + void AllReduceOpHandle::RunImpl() { + if (!IsEncoded()) { + RunImplNormal(); + return; + } + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + RunImplEncoded(); +#else + PADDLE_THROW("Not compiled with CUDA"); +#endif +} + +void AllReduceOpHandle::RunImplNormal() { platform::RecordEvent record_event(Name()); WaitInputVarGenerated(); + auto in_var_handles =
DynamicCast(this->Inputs()); auto out_var_handles = DynamicCast(this->Outputs()); PADDLE_ENFORCE_EQ( @@ -72,6 +260,8 @@ void AllReduceOpHandle::RunImpl() { auto &lod_tensor = local_scope.FindVar(in_var_handles[i]->name())->Get(); lod_tensors.emplace_back(&lod_tensor); + VLOG(10) << "place:" << i << ", input_name:" << in_var_handles[i]->name() + << ", out_name:" << out_var_handles[i]->name(); PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(), "The name of input and output should be equal."); } @@ -99,13 +289,17 @@ void AllReduceOpHandle::RunImpl() { auto &nccl_ctx = nccl_ctxs_->at(dev_id); auto stream = nccl_ctx.stream(); auto comm = nccl_ctx.comm_; + + VLOG(10) << "before all reduce buffer:" << buffer << ", numel:" << numel + << ", dev_id:" << dev_id << ", dtype:" << dtype + << ", place:" << p; + all_reduce_calls.emplace_back([=] { PADDLE_ENFORCE(platform::dynload::ncclAllReduce( buffer, buffer, numel, static_cast(dtype), ncclSum, comm, stream)); }); } - this->RunAndRecordEvent([&] { if (all_reduce_calls.size() == 1UL) { // Do not use NCCLGroup when manage NCCL by per thread per device diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.h b/paddle/fluid/framework/details/all_reduce_op_handle.h index b449796fcaee73a6b84e0db2b5c76ff94bedcf08..ca75186f6ceed3e48fe9326e85738d91bde0ca70 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/all_reduce_op_handle.h @@ -28,11 +28,19 @@ namespace paddle { namespace framework { namespace details { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +constexpr char g_dgc_counter_name[] = "__g_dgc_counter__"; +constexpr char g_dgc_rampup_begin_step[] = "__g_rampup_begin_step__"; +constexpr char g_dgc_encoded[] = "__dgc_encoded__"; +constexpr char g_dgc_k[] = "__dgc_k__"; +#endif + struct AllReduceOpHandle : public OpHandleBase { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) AllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, - const platform::NCCLContextMap *ctxs); + const platform::NCCLContextMap *ctxs, + bool is_encoded = false, int nranks = -1); #else AllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places); @@ -50,8 +58,14 @@ struct AllReduceOpHandle : public OpHandleBase { std::vector local_scopes_; std::vector places_; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + void RunImplEncoded(); const platform::NCCLContextMap *nccl_ctxs_; + bool is_encoded_{false}; + int nranks_{-1}; + int GetKValue(const std::string &grad_name); #endif + void RunImplNormal(); + bool IsEncoded(); }; } // namespace details diff --git a/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc b/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..fbc8bbf56b0d1ce75fbb459038c63571d0a16cd3 --- /dev/null +++ b/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc @@ -0,0 +1,393 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "paddle/fluid/framework/details/build_strategy.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/op_registry.h" +DEFINE_uint32(fuse_parameter_memory_size, 0, // 0 KB + "fuse_parameter_memory_size is the upper bound of the memory " + "size of one group of fused parameter gradients, which is the " + "input of a communication call (e.g. NCCLAllReduce). " + "The default value is 0, which means groups are not formed " + "according to memory size."); +DEFINE_int32( + fuse_parameter_groups_size, 3, + "fuse_parameter_groups_size is the number of parameter gradients in one " + "group. If it is 1, each gradient forms its own group, i.e. the number " + "of groups equals the number of gradients. If it is -1, all gradients " + "form a single group. The default value of 3 is an experimental result."); + +namespace paddle { +namespace framework { +namespace details { + +static const char kUnKnow[] = "@UNKNOW@"; +static framework::proto::VarType::Type kDefaultDtype = + framework::proto::VarType::Type::VarType_Type_BOOL; + +class AllocContinuousSpaceForGradPass : public ir::Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override { + ir::Graph &result = *graph; + + auto &places = Get>(kPlaces); + auto &local_scopes = Get>(kLocalScopes); + + ResetAttribute(kParamsAndGrads, &result); + ResetAttribute(kGroupGradsAndParams, &result); + + // NOTE: The operator nodes should be in topology order. + std::vector topo_nodes = ir::TopologySortOperations(result); + auto &params_grads = result.Get(kParamsAndGrads); + for (auto &node : topo_nodes) { + RecordParamsAndGrads(node, &params_grads); + } + + if (params_grads.size() == 0) { + VLOG(10) << "No gradients found."; + return std::move(graph); + } + + std::unordered_map vars; + for (ir::Node *node : result.Nodes()) { + if (node->IsVar() && node->Var()) { + // Note: the graph may contain nodes that share the same name. For + // example, a parameter is both an input of an operator and an output + // of the optimizer. + vars.emplace(node->Var()->Name(), node); + } + } + + auto &group_grads_params = + result.Get(kGroupGradsAndParams); + + // Note: the order of params_grads may be changed by SetGroupGradsAndParams. + SetGroupGradsAndParams(vars, params_grads, &group_grads_params); + + params_grads.clear(); + for (auto &group_p_g : group_grads_params) { + params_grads.insert(params_grads.begin(), group_p_g.begin(), + group_p_g.end()); + } + for (auto &p_g : params_grads) { + std::swap(p_g.first, p_g.second); + } + + // Set gradients as persistable to prevent these vars from becoming reusable.
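+ // All gradients fused into one buffer must share a single data type:
+ // the loop below records the dtype of the first gradient and enforces
+ // that every following gradient matches it.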
+ auto dtype = kDefaultDtype; + for (auto &p_g : params_grads) { + // Get gradient var + auto iter = vars.find(p_g.second); + PADDLE_ENFORCE(iter != vars.end(), "%s is not found.", p_g.second); + iter->second->Var()->SetPersistable(true); + + PADDLE_ENFORCE(IsSupportedVarType(iter->second->Var()->GetType())); + + // Get Dtype + auto ele_dtype = iter->second->Var()->GetDataType(); + if (dtype == kDefaultDtype) { + dtype = ele_dtype; + PADDLE_ENFORCE_NE(ele_dtype, kDefaultDtype); + } + PADDLE_ENFORCE_EQ(ele_dtype, dtype); + } + + // Create the fused variable name. + if (!result.Has(kFusedVars)) { + result.Set(kFusedVars, new FusedVars); + } + const std::string prefix(kFusedVarNamePrefix); + // The fused_var_name should be unique. + auto fused_var_name = prefix + "GRAD@" + params_grads[0].second; + auto &fused_var_set = result.Get(kFusedVars); + PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0); + fused_var_set.insert(fused_var_name); + + InitFusedVarsAndAllocSpaceForVars(places, local_scopes, vars, + fused_var_name, params_grads); + + return std::move(graph); + } + + template + void ResetAttribute(const std::string &attr_name, ir::Graph *graph) const { + if (graph->Has(attr_name)) { + VLOG(10) << attr_name << " is reset."; + graph->Erase(attr_name); + } + graph->Set(attr_name, new AttrType); + } + + void SetGroupGradsAndParams( + const std::unordered_map &var_nodes, + const ParamsAndGrads &params_grads, + GroupGradsAndParams *group_grads_params) const { + SetGroupAccordingToLayers(var_nodes, params_grads, group_grads_params); + SetGroupAccordingToMemorySize(var_nodes, group_grads_params); + SetGroupAccordingToGroupSize(var_nodes, group_grads_params); + } + + void SetGroupAccordingToLayers( + const std::unordered_map &var_nodes, + const ParamsAndGrads &params_grads, + GroupGradsAndParams *group_grads_params) const { + std::unordered_map> layer_params; + + for (size_t i = 0; i < params_grads.size(); ++i) { + auto pos = params_grads[i].first.find_first_of("."); + if (pos == std::string::npos) { + layer_params[std::string(kUnKnow)].emplace_back(i); + } else { + layer_params[params_grads[i].first.substr(0, pos)].emplace_back(i); + } + } + + group_grads_params->reserve(layer_params.size()); + for (size_t i = 0; i < params_grads.size(); ++i) { + auto pos = params_grads[i].first.find_first_of("."); + std::string key = kUnKnow; + if (pos != std::string::npos) { + key = params_grads[i].first.substr(0, pos); + } + auto iter = layer_params.find(key); + if (iter == layer_params.end()) continue; + + group_grads_params->emplace_back(); + auto &local_group_grads_params = group_grads_params->back(); + for (auto &idx : iter->second) { + local_group_grads_params.emplace_back( + std::make_pair(params_grads[idx].second, params_grads[idx].first)); + } + layer_params.erase(iter); + } + + VLOG(10) << "SetGroupAccordingToLayers: "; + for (size_t i = 0; i < group_grads_params->size(); ++i) { + VLOG(10) << "group " << i; + std::stringstream out; + for (auto &p_g : group_grads_params->at(i)) { + out << "(" << p_g.second << ", " << p_g.first << "), "; + } + VLOG(10) << out.str(); + } + } + + void SetGroupAccordingToMemorySize( + const std::unordered_map &var_nodes, + GroupGradsAndParams *group_grads_params) const { + if (FLAGS_fuse_parameter_memory_size == 0) { + return; + } + size_t group_memory_size = + static_cast<size_t>(FLAGS_fuse_parameter_memory_size); + GroupGradsAndParams local_group_grads_params; + + size_t j = 0; + while (j < group_grads_params->size()) { + local_group_grads_params.emplace_back(); + auto &group_p_g =
local_group_grads_params.back(); + size_t local_group_memory_size = 0; + while (j < group_grads_params->size()) { + std::for_each( + group_grads_params->at(j).begin(), group_grads_params->at(j).end(), + [&local_group_memory_size, + &var_nodes](const std::pair &g_p) { + auto iter = var_nodes.find(g_p.second); + PADDLE_ENFORCE(iter != var_nodes.end(), "%s is not found.", + g_p.second); + auto shape = iter->second->Var()->GetShape(); + size_t size = + framework::SizeOfType(iter->second->Var()->GetDataType()); + std::for_each(shape.begin(), shape.end(), + [&size](const int64_t &n) { size *= n; }); + local_group_memory_size += size; + }); + group_p_g.insert(group_p_g.end(), group_grads_params->at(j).begin(), + group_grads_params->at(j).end()); + ++j; + if (local_group_memory_size >= group_memory_size) { + break; + } + } + } + + std::swap(*group_grads_params, local_group_grads_params); + + VLOG(10) << string::Sprintf( + "SetGroupAccordingToMemorySize(memory_size: %d):", + FLAGS_fuse_parameter_memory_size); + for (size_t i = 0; i < group_grads_params->size(); ++i) { + VLOG(10) << "group " << i; + std::stringstream out; + for (auto &g_p : group_grads_params->at(i)) { + auto iter = var_nodes.find(g_p.second); + PADDLE_ENFORCE(iter != var_nodes.end(), "%s is not found.", g_p.second); + auto shape = iter->second->Var()->GetShape(); + size_t size = framework::SizeOfType(iter->second->Var()->GetDataType()); + std::for_each(shape.begin(), shape.end(), + [&size](const int64_t &n) { size *= n; }); + out << string::Sprintf("(%s(%d), %s)", g_p.second, size, g_p.first); + } + VLOG(10) << out.str(); + } + } + + void SetGroupAccordingToGroupSize( + const std::unordered_map &var_nodes, + GroupGradsAndParams *group_grads_params) const { + if (FLAGS_fuse_parameter_groups_size == 1) { + return; + } + size_t group_size = static_cast(FLAGS_fuse_parameter_groups_size); + if (FLAGS_fuse_parameter_groups_size == -1) { + group_size = group_grads_params->size(); + } + PADDLE_ENFORCE_GT(group_size, 1); + size_t groups = (group_grads_params->size() + group_size - 1) / group_size; + GroupGradsAndParams local_group_grads_params; + local_group_grads_params.reserve(groups); + + size_t j = 0; + for (size_t i = 0; i < groups; ++i) { + local_group_grads_params.emplace_back(); + auto &group_p_g = local_group_grads_params.back(); + group_p_g.reserve(group_size); + while (j < group_grads_params->size()) { + group_p_g.insert(group_p_g.end(), group_grads_params->at(j).begin(), + group_grads_params->at(j).end()); + ++j; + if (j % group_size == 0) break; + } + } + std::swap(*group_grads_params, local_group_grads_params); + + VLOG(10) << "SetGroupAccordingToGroupSize(group_size: " << group_size + << "): "; + for (size_t i = 0; i < group_grads_params->size(); ++i) { + VLOG(10) << "group " << i; + std::stringstream out; + for (auto &p_g : group_grads_params->at(i)) { + out << "(" << p_g.second << ", " << p_g.first << "), "; + } + VLOG(10) << out.str(); + } + } + + private: + bool IsSupportedVarType(const proto::VarType::Type &type) const { + // Current only support LOD_TENSOR. 
+ return type == proto::VarType::LOD_TENSOR; + } + + void AppendAllocSpaceForVarsOp(const std::vector ¶ms_name, + const std::vector &grads_name, + const std::string &fused_var_name, + BlockDesc *global_block) const { + auto op_desc = global_block->AppendOp(); + op_desc->SetType("alloc_continuous_space"); + op_desc->SetInput("Input", params_name); + op_desc->SetOutput("Output", grads_name); + op_desc->SetOutput("FusedOutput", {fused_var_name}); + } + + void RecordParamsAndGrads(ir::Node *node, + ParamsAndGrads *params_grads) const { + try { + bool is_bk_op = + static_cast(boost::get(node->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())) & + static_cast(OpRole::kBackward)); + if (!is_bk_op) return; + + // Currently, we assume that once gradient is generated, it can be + // broadcast, and each gradient is only broadcast once. + auto backward_vars = + boost::get>(node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + PADDLE_ENFORCE_EQ(backward_vars.size() % 2, static_cast(0)); + + for (size_t i = 0; i < backward_vars.size(); i += 2) { + VLOG(10) << "Trainable parameter: " << backward_vars[i] + << ", gradient: " << backward_vars[i + 1]; + + params_grads->emplace_back(std::make_pair( + backward_vars[i] /*param*/, backward_vars[i + 1] /*grad*/)); + } + } catch (boost::bad_get e) { + } + } + + void InitFusedVarsAndAllocSpaceForVars( + const std::vector &places, + const std::vector &local_scopes, + const std::unordered_map &vars, + const std::string &fused_var_name, + const ParamsAndGrads ¶ms_grads) const { + // Init Gradients and FusedVars + VLOG(10) << "Init FusedVars and Gradients."; + for (auto it = local_scopes.rbegin(); it != local_scopes.rend(); ++it) { + auto &scope = *it; + + PADDLE_ENFORCE(scope->FindVar(fused_var_name) == nullptr, + "%s has existed in scope.", fused_var_name); + scope->Var(fused_var_name)->GetMutable(); + + for (auto &p_g : params_grads) { + auto iter = vars.find(p_g.second); + PADDLE_ENFORCE(iter != vars.end()); + PADDLE_ENFORCE_NOT_NULL(iter->second->Var()); + PADDLE_ENFORCE_EQ(iter->second->Var()->GetType(), + proto::VarType::LOD_TENSOR); + scope->Var(p_g.second)->GetMutable(); + } + } + + std::vector grads_name; + std::vector params_name; + grads_name.reserve(params_grads.size()); + params_name.reserve(params_grads.size()); + for (auto &p_g : params_grads) { + params_name.emplace_back(p_g.first); + grads_name.emplace_back(p_g.second); + } + framework::ProgramDesc program_desc; + AppendAllocSpaceForVarsOp(params_name, grads_name, fused_var_name, + program_desc.MutableBlock(0)); + + // Run Only Once Programs + for (size_t i = 0; i < local_scopes.size(); ++i) { + for (auto &op_desc : program_desc.Block(0).AllOps()) { + auto op = OpRegistry::CreateOp(*op_desc); + op->Run(*local_scopes[i], places[i]); + } + } + } +}; + +} // namespace details +} // namespace framework +} // namespace paddle + +REGISTER_PASS(alloc_continuous_space_for_grad_pass, + paddle::framework::details::AllocContinuousSpaceForGradPass) + .RequirePassAttr(paddle::framework::details::kPlaces) + .RequirePassAttr(paddle::framework::details::kLocalScopes); diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h index 0c75e05f861636565ae855ddd534c1082d40d237..0b4d33513506d41a63db8316abaa5cd0458ff352 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.h +++ b/paddle/fluid/framework/details/broadcast_op_handle.h @@ -57,7 +57,7 @@ struct BroadcastOpHandle : public OpHandleBase { std::string Name() 
const override; - bool IsMultiDeviceTransfer() override { return false; }; + bool IsMultiDeviceTransfer() override { return true; }; protected: void RunImpl() override; diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 2cfc76e47f41862731fb2de5d1d03287acd4d9d7..5d9db237538599ec9a6887317b61af73f1113b97 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include +#include #include "paddle/fluid/framework/details/memory_optimize_helper.h" #include "paddle/fluid/framework/details/multi_devices_graph_pass.h" @@ -45,12 +46,27 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { public: explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy) : ir::PassBuilder(), strategy_(strategy) { + // Add a graph viz pass to record a graph. + if (!strategy_.debug_graphviz_path_.empty()) { + auto viz_pass = AppendPass("graph_viz_pass"); + const std::string graph_path = string::Sprintf( + "%s%s", strategy_.debug_graphviz_path_.c_str(), "_original_graph"); + viz_pass->Set("graph_viz_path", new std::string(graph_path)); + } + if (strategy_.enable_sequential_execution_) { + VLOG(10) << "Add sequential_execution_pass"; AppendPass("sequential_execution_pass"); } + // Add op fusion. + if (strategy.sync_batch_norm_) { + AppendPass("sync_batch_norm_pass"); + } + // Add op fusion. if (strategy.fuse_relu_depthwise_conv_) { + VLOG(10) << "Add fuse_relu_depthwise_conv_pass"; AppendPass("fuse_relu_depthwise_conv_pass"); } @@ -62,29 +78,30 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // Add automatically inplace. if (strategy_.enable_inplace_) { + VLOG(10) << "Add inplace_pass"; AppendPass("inplace_pass"); } + if (strategy.fuse_elewise_add_act_ops_) { + VLOG(10) << "Add fuse_elewise_add_act_pass"; + AppendPass("fuse_elewise_add_act_pass"); + } + + // for single card training, fuse_all_reduce_ops is unnecessary. + // alloc_continuous_space_for_grad_pass should be before of MultiDevPass. + if (strategy.fuse_all_reduce_ops_) { + VLOG(10) << "Add alloc_continuous_space_for_grad_pass"; + AppendPass("alloc_continuous_space_for_grad_pass"); + } + // Add a graph viz pass to record a graph. - if (!strategy_.debug_graphviz_path_.empty()) { + if (!strategy.debug_graphviz_path_.empty()) { auto viz_pass = AppendPass("graph_viz_pass"); const std::string graph_path = string::Sprintf( - "%s%s", strategy_.debug_graphviz_path_.c_str(), "_original_graph"); + "%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph"); viz_pass->Set("graph_viz_path", new std::string(graph_path)); } - if (strategy.fuse_elewise_add_act_ops_) { - auto fuse_elewise_add_act_pass = AppendPass("fuse_elewise_add_act_pass"); - // Add a graph viz pass to record a graph. 
- if (!strategy.debug_graphviz_path_.empty()) { - auto viz_pass = AppendPass("graph_viz_pass"); - const std::string graph_path = string::Sprintf( - "%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph"); - viz_pass->Set("graph_viz_path", - new std::string(graph_path)); - } - } - CollectiveContext *context = CollectiveContext::GetInstance(); context->endpoints_ = strategy_.trainers_endpoints_; context->trainer_id_ = strategy_.trainer_id_; @@ -102,11 +119,19 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // A side-effect of that, memory optimize cannot forsee the fetched vars // , so fetchlist should be set persistable before call the Run interface. if (strategy.memory_optimize_) { - auto memory_optimize_pass = AppendPass("memory_optimize_pass"); + VLOG(10) << "Add memory_optimize_pass"; + AppendPass("memory_optimize_pass"); } AppendMultiDevPass(strategy); + if (strategy.fuse_all_reduce_ops_) { + // NOTE: fuse_all_reduce_ops will count the number of all_reduce operator + // first, if the number is zero, fuse_all_reduce_ops will do nothing. + VLOG(10) << "Add fuse_all_reduce_op_pass"; + AppendPass("fuse_all_reduce_op_pass"); + } + // Add a graph print pass to record a graph with device info. if (!strategy_.debug_graphviz_path_.empty()) { auto multi_devices_print_pass = AppendPass("multi_devices_print_pass"); @@ -122,28 +147,34 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // Verify that the graph is correct for multi-device executor. AppendPass("multi_devices_check_pass"); + if (VLOG_IS_ON(2)) { + AppendPass("all_reduce_deps_pass"); + } + if (SeqOnlyAllReduceOps(strategy)) { + VLOG(10) << "Add all_reduce_deps_pass"; AppendPass("all_reduce_deps_pass"); } if (strategy_.remove_unnecessary_lock_) { + VLOG(10) << "Add modify_op_lock_and_record_event_pass"; AppendPass("modify_op_lock_and_record_event_pass"); } } // Convert graph to run on multi-devices. void AppendMultiDevPass(const BuildStrategy &strategy) { - ir::Pass *multi_devices_pass; + ir::Pass *multi_devices_pass = nullptr; if (strategy_.is_distribution_) { - VLOG(3) << "multi device parameter server mode"; + VLOG(10) << "Add dist_multi_devices_pass"; multi_devices_pass = AppendPass("dist_multi_devices_pass").get(); } else { if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { - VLOG(3) << "multi devices collective mode with allreduce"; + VLOG(10) << "Add all_reduce_mode_multi_devices_pass"; multi_devices_pass = - AppendPass("allreduce_mode_multi_devices_pass").get(); + AppendPass("all_reduce_mode_multi_devices_pass").get(); } else if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kReduce) { - VLOG(3) << "multi deivces collective mode with reduce"; + VLOG(10) << "Add reduce_mode_multi_devices_pass"; multi_devices_pass = AppendPass("reduce_mode_multi_devices_pass").get(); } else { PADDLE_THROW("Unknown reduce strategy."); @@ -200,9 +231,26 @@ std::unique_ptr BuildStrategy::Apply( #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr; - pass->Erase("nccl_ctxs"); - pass->SetNotOwned("nccl_ctxs", nctx); + pass->Erase(kNCCLCtxs); + pass->SetNotOwned(kNCCLCtxs, nctx); +#endif + } else if (pass->Type() == "fuse_all_reduce_op_pass") { + pass->Erase(kPlaces); + pass->SetNotOwned>(kPlaces, &places); + pass->Erase(kLocalScopes); + pass->SetNotOwned>(kLocalScopes, + &local_scopes); +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + platform::NCCLContextMap *nctx = use_cuda ? 
nccl_ctxs : nullptr; + pass->Erase(kNCCLCtxs); + pass->SetNotOwned(kNCCLCtxs, nctx); #endif + } else if (pass->Type() == "alloc_continuous_space_for_grad_pass") { + pass->Erase(kPlaces); + pass->SetNotOwned>(kPlaces, &places); + pass->Erase(kLocalScopes); + pass->SetNotOwned>(kLocalScopes, + &local_scopes); } else if (pass->Type() == "sequential_execution_pass") { LOG(INFO) << "set enable_sequential_execution:" << enable_sequential_execution_; @@ -227,12 +275,13 @@ std::unique_ptr BuildStrategy::Apply( } // namespace framework } // namespace paddle +USE_PASS(sync_batch_norm_pass); USE_PASS(fuse_relu_depthwise_conv_pass); USE_PASS(fuse_elewise_add_act_pass); USE_PASS(graph_viz_pass); USE_PASS(multi_batch_merge_pass); USE_PASS(reduce_mode_multi_devices_pass); -USE_PASS(allreduce_mode_multi_devices_pass); +USE_PASS(all_reduce_mode_multi_devices_pass); USE_PASS(dist_multi_devices_pass); USE_PASS(multi_devices_check_pass); USE_PASS(multi_devices_print_pass); @@ -242,4 +291,6 @@ USE_PASS(all_reduce_deps_pass); USE_PASS(modify_op_lock_and_record_event_pass); USE_PASS(inplace_pass); USE_PASS(lock_free_optimize_pass); +USE_PASS(alloc_continuous_space_for_grad_pass); USE_PASS(graph_to_program_pass); +USE_PASS(fuse_all_reduce_op_pass); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index d755a2505aead37538bef2b01a193dba87dc1567..4b599fb914dc7c35a0524ea62ba8d458b8dccf8f 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -16,6 +16,7 @@ #include #include +#include #include #include "paddle/fluid/framework/ir/pass_builder.h" @@ -75,8 +76,12 @@ struct BuildStrategy { bool fuse_elewise_add_act_ops_{false}; + bool fuse_all_reduce_ops_{false}; + bool fuse_relu_depthwise_conv_{false}; + bool sync_batch_norm_{false}; + bool memory_optimize_{true}; // TODO(dzhwinter): // make enable_inplace, memory_optimize_ diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h index 1e3dbb1e44ecb16872e3bf4dee31e31cc69c9818..e98b16e6b3a07bfa0994295306e3bfa9e4174834 100644 --- a/paddle/fluid/framework/details/computation_op_handle.h +++ b/paddle/fluid/framework/details/computation_op_handle.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include @@ -31,6 +32,8 @@ class ComputationOpHandle : public OpHandleBase { ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place, size_t scope_idx); + OperatorBase *GetOp() { return op_.get(); } + std::string Name() const override; const Scope *GetScope() const { return scope_; } diff --git a/paddle/fluid/framework/details/data_balance_op_handle.cc b/paddle/fluid/framework/details/data_balance_op_handle.cc deleted file mode 100644 index c9b52b68205ade000e21a3d06b80af86cbe01f34..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/data_balance_op_handle.cc +++ /dev/null @@ -1,154 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/details/data_balance_op_handle.h" -#include -#include "paddle/fluid/framework/details/container_cast.h" - -namespace paddle { -namespace framework { -namespace details { - -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) -DataBalanceOpHandle::DataBalanceOpHandle( - ir::Node *node, const std::vector &local_scopes, - const std::vector &places, - const platform::NCCLContextMap *ctxs) - : OpHandleBase(node), local_scopes_(local_scopes), places_(places) { - if (ctxs) { - for (auto &p : places_) { - this->SetDeviceContext(p, ctxs->DevCtx(p)); - } - } -} -#else -DataBalanceOpHandle::DataBalanceOpHandle( - ir::Node *node, const std::vector &local_scopes, - const std::vector &places) - : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {} -#endif - -std::string DataBalanceOpHandle::Name() const { return "data balance"; } - -std::vector> DataBalanceOpHandle::GetBalancePlan( - const std::vector &device_sizes) { - int device_num = device_sizes.size(); - int total_size = 0; - int empty_num = 0; - std::vector> size_device_vec; - size_device_vec.reserve(device_num); - for (int i = 0; i < device_num; ++i) { - if (device_sizes[i] == 0) { - ++empty_num; - } - total_size += device_sizes[i]; - size_device_vec.push_back({{device_sizes[i], i}}); - } - std::vector> res; - if (empty_num == 0) { - // No need to do data balance. - return res; - } - if (total_size < device_num) { - // No enough data. - PADDLE_THROW_EOF(); - } - std::sort(size_device_vec.begin(), size_device_vec.end(), - [](const std::array &a, const std::array &b) { - return a[0] > b[0]; - }); - int expected_device_size = total_size / device_num; - int src_idx = 0; - for (int dst_idx = device_num - empty_num; dst_idx < device_num; ++dst_idx) { - if (size_device_vec[src_idx][0] <= expected_device_size) { - ++src_idx; - PADDLE_ENFORCE_LT( - src_idx, device_num - empty_num, - "In current srategy an empty tensor should not be copy source."); - } - size_device_vec[src_idx][0] -= expected_device_size; - size_device_vec[dst_idx][0] += expected_device_size; - res.push_back({{size_device_vec[src_idx][1], size_device_vec[dst_idx][1], - expected_device_size}}); - } - return res; -} - -void DataBalanceOpHandle::RunImpl() { - PADDLE_ENFORCE_GT(places_.size(), 1UL, - "Data balance can only be enabled when the number of " - "places to run larger than 1."); - auto in_var_handles = DynamicCast(this->Inputs()); - auto out_var_handles = DynamicCast(this->Outputs()); - PADDLE_ENFORCE(in_var_handles.size() % places_.size() == 0); - PADDLE_ENFORCE_EQ( - in_var_handles.size(), out_var_handles.size(), - "The NoDummyInputSize and NoDummyOutputSize should be equal."); - int data_num = in_var_handles.size() / places_.size(); - WaitInputVarGenerated(); - std::vector> lod_tensors(data_num); - std::vector device_sizes; - for (int i = 0; i < static_cast(in_var_handles.size()); ++i) { - PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(), - "The name of input and output should be equal."); - int place_idx = i / data_num; - int data_idx = i % data_num; - auto *local_scope = - local_scopes_[place_idx]->FindVar(kLocalExecScopeName)->Get(); - auto *tensor_var = local_scope->FindVar(in_var_handles[i]->name()); - PADDLE_ENFORCE(tensor_var->IsType()); - auto *tensor = tensor_var->GetMutable(); - lod_tensors[data_idx].push_back(tensor); - int ins_size = - tensor->lod().empty() ? 
tensor->dims()[0] : tensor->NumElements(); - if (data_idx == 0) { - device_sizes.emplace_back(ins_size); - } else { - PADDLE_ENFORCE_EQ( - ins_size, device_sizes.at(place_idx), - "All data on the same device shall have the same batch size."); - } - } - const auto &balance_plan = GetBalancePlan(device_sizes); - - for (const auto &trans : balance_plan) { - for (int data_idx = 0; data_idx < data_num; ++data_idx) { - LoDTensor *src_tensor = lod_tensors[data_idx][trans[0]]; - LoDTensor *dst_tensor = lod_tensors[data_idx][trans[1]]; - int trans_ins_size = trans[2]; - LoD src_lod = src_tensor->lod(); - int src_ins_size = - src_lod.empty() ? src_tensor->dims()[0] : src_tensor->NumElements(); - int cut_point = src_ins_size - trans_ins_size; - if (!src_lod.empty()) { - for (auto &level : src_lod) { - cut_point = level[cut_point]; - } - } - TensorCopySync(src_tensor->Slice(cut_point, src_tensor->dims()[0]), - dst_tensor->place(), dst_tensor); - src_tensor->ShareDataWith(src_tensor->Slice(0, cut_point)); - if (!src_lod.empty()) { - dst_tensor->set_lod(SliceInLevel( - src_lod, 0, src_ins_size - trans_ins_size, src_ins_size)); - src_tensor->set_lod( - SliceInLevel(src_lod, 0, 0, src_ins_size - trans_ins_size)); - } - } - } -} - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/data_balance_op_handle.h b/paddle/fluid/framework/details/data_balance_op_handle.h deleted file mode 100644 index 2db18a1a7203f85aac6338576f2e68c7b37d7c69..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/data_balance_op_handle.h +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
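For reference, the GetBalancePlan routine deleted above plans one batch redistribution: every device should end up with roughly total_size / device_num instances, and devices that received nothing are filled by slicing from the fullest sources. A standalone sketch of the same planning logic (illustrative only, not part of this patch; the real op raises EOF when there is less than one instance per device, while this sketch simply returns an empty plan):

    #include <algorithm>
    #include <array>
    #include <vector>

    // Returns a list of {src_dev, dst_dev, trans_size} moves that fill the
    // empty devices from the devices holding the most instances.
    std::vector<std::array<int, 3>> BalancePlan(const std::vector<int> &sizes) {
      int device_num = static_cast<int>(sizes.size());
      int total = 0, empty = 0;
      std::vector<std::array<int, 2>> size_dev;  // {size, device_id}
      for (int i = 0; i < device_num; ++i) {
        if (sizes[i] == 0) ++empty;
        total += sizes[i];
        size_dev.push_back({{sizes[i], i}});
      }
      std::vector<std::array<int, 3>> plan;
      // Nothing to do, or not enough data to give every device one instance
      // (the deleted op throws EOF in the second case).
      if (empty == 0 || total < device_num) return plan;
      std::sort(size_dev.begin(), size_dev.end(),
                [](const std::array<int, 2> &a, const std::array<int, 2> &b) {
                  return a[0] > b[0];
                });
      int expected = total / device_num;
      int src = 0;
      for (int dst = device_num - empty; dst < device_num; ++dst) {
        if (size_dev[src][0] <= expected) ++src;  // this source is exhausted
        size_dev[src][0] -= expected;
        size_dev[dst][0] += expected;
        plan.push_back({{size_dev[src][1], size_dev[dst][1], expected}});
      }
      return plan;
    }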
- -#pragma once - -#include -#include -#include "paddle/fluid/framework/details/op_handle_base.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) -#include "paddle/fluid/platform/nccl_helper.h" -#endif - -namespace paddle { -namespace framework { -namespace details { - -struct DataBalanceOpHandle : public OpHandleBase { - public: -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - DataBalanceOpHandle(ir::Node *node, const std::vector &local_scopes, - const std::vector &places, - const platform::NCCLContextMap *ctxs); -#else - DataBalanceOpHandle(ir::Node *node, const std::vector &local_scopes, - const std::vector &places); -#endif - - std::string Name() const override; - - bool IsMultiDeviceTransfer() override { return false; }; - - protected: - void RunImpl() override; - - private: - // std::vector<(src_dev_id, dst_dev_id, trans_size)> - std::vector> GetBalancePlan( - const std::vector &batch_size_per_device); - - const std::vector local_scopes_; - const std::vector places_; -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index 03fbfd7f24a8a987db72f45be777acc7ece577a6..dbc90737f2286db6e74d3271f39d004c25e4a949 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -12,6 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include +#include +#include + #include "paddle/fluid/framework/details/eager_deletion_op_handle.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/scope.h" @@ -45,6 +49,7 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( } } #endif + PADDLE_ENFORCE(!var_names_.empty(), "Var names cannot be empty"); } EagerDeletionOpHandle::~EagerDeletionOpHandle() { @@ -60,15 +65,20 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() { std::string EagerDeletionOpHandle::Name() const { return "eager_deletion"; } void EagerDeletionOpHandle::RunImpl() { - auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get(); + Scope *exec_scope = nullptr; std::deque> garbages; for (auto &name : var_names_) { auto it = ref_cnts_->find(name); - // Var not found, not reference count has not decreased to 0 + // Reference count has not decreased to 0 if (it == ref_cnts_->end() || it->second.fetch_sub(1) != 1) { continue; } + if (!exec_scope) { + exec_scope = scope_->FindVar(kLocalExecScopeName)->Get(); + } + + // Var not found auto *var = exec_scope->FindVar(name); if (var == nullptr) { continue; diff --git a/paddle/fluid/framework/details/eager_deletion_pass.cc b/paddle/fluid/framework/details/eager_deletion_pass.cc index 4e42d0b4972d567dd769cad6ff8b9d45380ab77a..a6baa26134cf36ea93dde554f808e73fa0c30b93 100644 --- a/paddle/fluid/framework/details/eager_deletion_pass.cc +++ b/paddle/fluid/framework/details/eager_deletion_pass.cc @@ -12,20 +12,168 @@ // See the License for the specific language governing permissions and // limitations under the License. 
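The RunImpl change to EagerDeletionOpHandle above makes the scope lookup lazy: the local exec scope is only resolved once some variable's reference count actually reaches zero, and the atomic fetch_sub guarantees exactly one op performs the deletion. The pattern, reduced to a standalone sketch (names here are illustrative, not Paddle's API):

    #include <atomic>
    #include <string>
    #include <unordered_map>
    #include <vector>

    // Each variable name carries an atomic counter of the ops that still
    // read it; the op whose decrement observes 1 (i.e. it was the last
    // reader) owns the deletion. `Delete` stands in for the scope lookup
    // plus garbage collection done by the real handle.
    void TryDelete(std::unordered_map<std::string, std::atomic<int>> *ref_cnts,
                   const std::vector<std::string> &var_names,
                   void (*Delete)(const std::string &)) {
      for (auto &name : var_names) {
        auto it = ref_cnts->find(name);
        // Var not found, or its reference count has not yet dropped to 0.
        if (it == ref_cnts->end() || it->second.fetch_sub(1) != 1) continue;
        Delete(name);  // only reached by the final user of the variable
      }
    }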
+#include
+#include
 #include
 #include
+#include
 #include
 
 #include "paddle/fluid/framework/details/computation_op_handle.h"
 #include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
-#include "paddle/fluid/framework/details/eager_deletion_pass.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/garbage_collector.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
 
 namespace paddle {
 namespace framework {
 namespace details {
 
+// op -> variables which can be deleted after the op runs
+using OpToVarNameSetMap =
+    std::unordered_map<ComputationOpHandle *, std::unordered_set<std::string>>;
+
+// Check whether the variable is a LoDTensor based on static VarDesc info
+static bool IsLoDTensor(VarDesc *var) {
+  return var->Proto()->type().type() == proto::VarType::LOD_TENSOR;
+}
+
+// Get the memory size of a LoDTensor
+static int64_t GetMemorySize(
+    const std::unordered_map<std::string, std::vector<VarHandle *>> &vars,
+    const std::string &var_name) {
+  auto *var_desc = TryGetLatestVarDesc(vars.at(var_name));
+  PADDLE_ENFORCE_NOT_NULL(var_desc);
+  PADDLE_ENFORCE(IsLoDTensor(var_desc));
+  auto dims = var_desc->GetShape();
+  return SizeOfType(var_desc->GetDataType()) *
+         std::accumulate(dims.begin(), dims.end(), static_cast<int64_t>(1),
+                         std::multiplies<int64_t>());
+}
+
+// Split all variables in the graph into LoDTensor and non-LoDTensor (e.g.
+// SelectedRows, LoDTensorArray) variables. Since partial GC is based on
+// static analysis of the memory size of each variable, SelectedRows and
+// LoDTensorArray must be skipped here.
+static void SplitIntoLoDTensorAndNonLoDTensorVars(
+    const OpToVarNameSetMap &m, const GraphVars &vars,
+    OpToVarNameSetMap *lod_tensors, OpToVarNameSetMap *other_vars) {
+  lod_tensors->clear();
+  other_vars->clear();
+
+  for (auto &op_vars_pair : m) {
+    for (auto &var_name : op_vars_pair.second) {
+      auto *var_desc = TryGetLatestVarDesc(
+          vars[op_vars_pair.first->GetScopeIdx()].at(var_name));
+      if (IsLoDTensor(var_desc)) {
+        (*lod_tensors)[op_vars_pair.first].insert(var_name);
+      } else {
+        (*other_vars)[op_vars_pair.first].insert(var_name);
+      }
+    }
+  }
+}
+
+struct GCVarInfo {
+  GCVarInfo(const std::string &name, int64_t memory_size,
+            ComputationOpHandle *op, size_t scope_idx)
+      : name_(name),
+        memory_size_(memory_size),
+        op_(op),
+        scope_idx_(scope_idx) {}
+
+  std::string name_;         // variable name
+  int64_t memory_size_;      // memory size
+  ComputationOpHandle *op_;  // op after which the variable could be deleted
+  size_t scope_idx_;         // scope index where the variable locates
+
+  int64_t AbsMemorySize() const { return std::abs(memory_size_); }
+};
+
+// NOTE: delete_lod_tensor_only is not used currently.
+static OpToVarNameSetMap ShrinkGCVars(
+    const OpToVarNameSetMap &m, const GraphVars &vars,
+    const std::vector<platform::Place> &places, double fraction_of_memory_size,
+    bool delete_lod_tensor_only = false) {
+  // Do not perform gc when fraction_of_memory_size = 0
+  if (fraction_of_memory_size <= 0.0) return {};
+
+  /**
+   * Step 1: Split all variables into LoDTensor and non-LoDTensor.
+   * We can only calculate the memory size of LoDTensors.
+   */
+  OpToVarNameSetMap lod_tensors, other_vars;
+  SplitIntoLoDTensorAndNonLoDTensorVars(m, vars, &lod_tensors, &other_vars);
+
+  // Perform complete gc when fraction_of_memory_size >= 1
+  if (fraction_of_memory_size >= 1.0) {
+    return delete_lod_tensor_only ? lod_tensors : m;
+  }
+
+  /**
+   * Step 2: build GCVarInfos, and calculate the total memory size of each
+   * device.
+   */
+
+  // place -> variable info (name, memory size, place, scope_idx)
+  std::map<platform::Place, std::vector<GCVarInfo>> place_to_vars;
+
+  // place -> total memory size
+  std::map<platform::Place, int64_t> place_to_size;
+  for (auto &op_vars_pair : lod_tensors) {
+    auto *op = op_vars_pair.first;
+    auto &var_names = op_vars_pair.second;
+    auto scope_idx = op->GetScopeIdx();
+    auto &place = places[scope_idx];
+
+    for (auto &var_name : var_names) {
+      auto var_size = GetMemorySize(vars[scope_idx], var_name);
+      GCVarInfo var_info(var_name, var_size, op, scope_idx);
+      place_to_size[place] += var_info.AbsMemorySize();
+      place_to_vars[place].emplace_back(std::move(var_info));
+    }
+  }
+
+  /**
+   * Step 3: sort GCVarInfos, and only delete the largest variables.
+   */
+  OpToVarNameSetMap partial_vars;
+  for (auto &place_to_var_pair : place_to_vars) {
+    auto &place = place_to_var_pair.first;
+    auto &gc_vars = place_to_var_pair.second;
+    std::sort(gc_vars.begin(), gc_vars.end(),
+              [](const GCVarInfo &var1, const GCVarInfo &var2) {
+                return var1.AbsMemorySize() > var2.AbsMemorySize();
+              });
+
+    int64_t accumulated_size = 0;
+    int64_t size_threshold =
+        static_cast<int64_t>(fraction_of_memory_size * place_to_size[place]);
+    for (size_t i = 0; i < gc_vars.size() && accumulated_size < size_threshold;
+         ++i) {
+      partial_vars[gc_vars[i].op_].insert(gc_vars[i].name_);
+      accumulated_size += gc_vars[i].AbsMemorySize();
+    }
+  }
+
+  /**
+   * Step 4: Combine other vars (SelectedRows, LoDTensorArray)
+   */
+  if (!delete_lod_tensor_only) {
+    for (auto &op_vars_pair : other_vars) {
+      partial_vars[op_vars_pair.first].insert(op_vars_pair.second.begin(),
+                                              op_vars_pair.second.end());
+    }
+  }
+
+  return partial_vars;
+}
+
+class EagerDeletionPass : public ir::Pass {
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+
 std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
     std::unique_ptr<ir::Graph> graph) const {
   auto &ref_cnts =
@@ -43,9 +191,7 @@ std::unique_ptr EagerDeletionPass::ApplyImpl(
 
   // a reverse map of last_live_ops
   // i.e., last op --> variable names which can be deleted.
- std::unordered_map> - op_vars_map; - + OpToVarNameSetMap op_vars_map; for (auto &var_ops_map : last_live_ops) { for (auto &var_ops_pair : var_ops_map) { const std::string &var_name = var_ops_pair.first; @@ -55,6 +201,10 @@ std::unique_ptr EagerDeletionPass::ApplyImpl( } } + double memory_fraction = framework::GetEagerDeletionMemoryFraction(); + + op_vars_map = ShrinkGCVars(op_vars_map, vars, places, memory_fraction); + for (auto &pair : op_vars_map) { auto *op = pair.first; auto &var_names = pair.second; @@ -85,8 +235,12 @@ std::unique_ptr EagerDeletionPass::ApplyImpl( eager_deletion_op->AddOutput(dummy_leaf); } + VLOG(10) << "FLAGS_memory_fraction_of_eager_deletion = " << memory_fraction; VLOG(10) << "Create " << op_vars_map.size() << " EagerDeletionOpHandle(s)"; - return graph; + + auto while_op_eager_deletion_pass = + ir::PassRegistry::Instance().Get("while_op_eager_deletion_pass"); + return while_op_eager_deletion_pass->Apply(std::move(graph)); } } // namespace details @@ -99,3 +253,5 @@ REGISTER_PASS(eager_deletion_pass, .RequirePassAttr(paddle::framework::details::kLastLiveOpsOfVars) .RequirePassAttr(paddle::framework::details::kAllPlaces) .RequirePassAttr(paddle::framework::details::kGarbageCollector); + +USE_PASS(while_op_eager_deletion_pass); diff --git a/paddle/fluid/framework/details/early_delete_op_handle.h b/paddle/fluid/framework/details/early_delete_op_handle.h deleted file mode 100644 index c8382d34b790ba7c95415acdf0b55dc97a9cd265..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/early_delete_op_handle.h +++ /dev/null @@ -1,140 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
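ShrinkGCVars above implements the partial-GC policy behind the eager-deletion memory fraction: per place, variables are sorted by statically inferred memory size, and only the largest ones are eagerly deleted until their accumulated size reaches fraction * total. A minimal standalone sketch of that selection rule (hypothetical names, not part of the patch):

    #include <algorithm>
    #include <cstdint>
    #include <string>
    #include <utility>
    #include <vector>

    // Given {name, bytes} pairs for one place and a fraction in (0, 1),
    // pick the largest variables whose accumulated size stays below
    // fraction * total; only those are scheduled for eager deletion.
    std::vector<std::string> PickLargest(
        std::vector<std::pair<std::string, int64_t>> vars, double fraction) {
      int64_t total = 0;
      for (auto &v : vars) total += v.second;
      std::sort(vars.begin(), vars.end(),
                [](const std::pair<std::string, int64_t> &a,
                   const std::pair<std::string, int64_t> &b) {
                  return a.second > b.second;  // largest first
                });
      std::vector<std::string> picked;
      int64_t acc = 0;
      auto threshold = static_cast<int64_t>(fraction * total);
      for (size_t i = 0; i < vars.size() && acc < threshold; ++i) {
        picked.push_back(vars[i].first);
        acc += vars[i].second;
      }
      return picked;
    }

Deleting the few largest tensors first buys most of the memory savings while keeping the number of extra deletion ops, and hence the scheduling overhead, small.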
- -#pragma once -#include -#include -#include "paddle/fluid/framework/details/computation_op_handle.h" -#include "paddle/fluid/framework/details/op_handle_base.h" -#include "paddle/fluid/framework/details/var_handle.h" -#include "paddle/fluid/framework/garbage_collector.h" -#include "paddle/fluid/framework/lod_tensor_array.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor.h" - -namespace paddle { -namespace framework { -namespace details { - -class EarlyDeleteOpHandle : public OpHandleBase { - public: - EarlyDeleteOpHandle(ir::Node* node, const Scope* scope, - const platform::Place& place, - const std::vector& names, - GarbageCollector* gc) - : OpHandleBase(node), - scope_(scope), - place_(place), - names_(names), - gc_(gc) { -#ifdef PADDLE_WITH_CUDA - if (IsStreamGarabageCollector()) { - auto gpu_place = boost::get(place); - PADDLE_ENFORCE(cudaSetDevice(gpu_place.device)); - PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); - } -#endif - } - ~EarlyDeleteOpHandle() { -#ifdef PADDLE_WITH_CUDA - if (IsStreamGarabageCollector()) { - auto gpu_place = boost::get(dev_ctx_->GetPlace()); - PADDLE_ENFORCE(cudaSetDevice(gpu_place.device)); - PADDLE_ENFORCE(cudaEventDestroy(event_)); - } -#endif - } - - std::string Name() const override { return "early_delete"; } - - protected: - void RunImpl() override { - std::vector> tensors; - auto* local_scope = scope_->FindVar(kLocalExecScopeName)->Get(); - for (auto& var_name : names_) { - auto* var = local_scope->FindVar(var_name); - PADDLE_ENFORCE(var != nullptr, - string::Sprintf("Local Scope not has var %s", var_name)); - if (var->IsType()) { - tensors.emplace_back(var->GetMutable()->MoveMemoryHolder()); - } else if (var->IsType()) { - tensors.emplace_back(var->GetMutable() - ->mutable_value() - ->MoveMemoryHolder()); - } else if (var->IsType()) { - LoDTensorArray* tensor_array = var->GetMutable(); - for (auto& tensor : *tensor_array) { - tensors.emplace_back(tensor.MoveMemoryHolder()); - } - } - } - if (!tensors.empty()) { - ClearTensors(tensors); - } - } - - private: - void ClearTensors( - const std::vector>& tensors) { - if (platform::is_cpu_place(place_)) { - ClearCPUTensors(tensors); - } else { - ClearGPUTensors(tensors); - } - } - - void ClearCPUTensors( - const std::vector>& tensors) { - auto* gc = dynamic_cast(gc_); - if (gc != nullptr) { - gc->Add(tensors); - } - } - - void ClearGPUTensors( - const std::vector>& tensors) { -#ifdef PADDLE_WITH_CUDA - auto* gc = dynamic_cast(gc_); - if (gc != nullptr) { - auto compute_stream = dev_ctx_->stream(); - auto callback_stream = gc->stream(); - auto callback_func = [=]() { - PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream)); - PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0)); - }; - gc_->Add(tensors, callback_func); - } else { - gc_->Add(tensors); - } - } - - bool IsStreamGarabageCollector() const { - return dynamic_cast(gc_) != nullptr; -#endif - } - - const Scope* scope_; - const platform::Place place_; - std::vector names_; - GarbageCollector* gc_; -#ifdef PADDLE_WITH_CUDA - platform::CUDADeviceContext* dev_ctx_; - cudaEvent_t event_; -#endif -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/fetch_barrier_op_handle.cc b/paddle/fluid/framework/details/fetch_barrier_op_handle.cc new file mode 100644 index 0000000000000000000000000000000000000000..019ecfbb61028537692c8fdeb874c6c490f75430 --- /dev/null 
+++ b/paddle/fluid/framework/details/fetch_barrier_op_handle.cc
@@ -0,0 +1,66 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/fetch_barrier_op_handle.h"
+
+#include
+
+namespace paddle {
+namespace framework {
+namespace details {
+FetchBarrierOpHandle::FetchBarrierOpHandle(
+    ir::Node *node, const std::vector<Scope *> &local_scopes,
+    const std::vector<platform::Place> &places)
+    // The fetch_barrier op always runs on place0, but outputs on all places.
+    : OpHandleBase(node),
+      op_(framework::OpRegistry::CreateOp(*node->Op())),
+      local_scopes_(local_scopes),
+      places_(places),
+      run_scope_(local_scopes[0]),
+      place_(places[0]) {
+  for (auto &p : places) {
+    this->SetDeviceContext(p, platform::DeviceContextPool::Instance().Get(p));
+  }
+}
+
+bool FetchBarrierOpHandle::IsMultiDeviceTransfer() {
+  // override IsMultiDeviceTransfer to return true
+  return true;
+}
+
+void FetchBarrierOpHandle::RunImpl() {
+  WaitInputVarGenerated(place_);
+
+  auto run_func = [this]() {
+    op_->Run(*run_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(), place_);
+  };
+
+  if (is_lock_and_record_event_free_) {
+    run_func();
+  } else {
+    this->RunAndRecordEvent(run_func);
+  }
+}
+
+bool FetchBarrierOpHandle::NeedWait(VarHandleBase *in_var) {
+  bool need_wait =
+      in_var && in_var->GeneratedOp() &&
+      in_var->GeneratedOp()->DeviceContext(place_) != dev_ctxes_.at(place_);
+  return need_wait;
+}
+
+std::string FetchBarrierOpHandle::Name() const { return op_->Type(); }
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fuse_vars_op_handle.h b/paddle/fluid/framework/details/fetch_barrier_op_handle.h
similarity index 52%
rename from paddle/fluid/framework/details/fuse_vars_op_handle.h
rename to paddle/fluid/framework/details/fetch_barrier_op_handle.h
index b40b01df36479543e8b2779762210ae144d7d9be..b4d12785e0345c887f179bc53c8446dc1438f889 100644
--- a/paddle/fluid/framework/details/fuse_vars_op_handle.h
+++ b/paddle/fluid/framework/details/fetch_barrier_op_handle.h
@@ -14,13 +14,13 @@
 
 #pragma once
 
-#include
+#include
 #include
 #include
 
-#include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/op_handle_base.h"
-#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/platform/device_context.h"
@@ -28,38 +28,34 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-struct FuseVarsOpHandle : public OpHandleBase {
+// **NOTE**: the fetch_barrier op is special: it outputs all received
+// variables on all places. If there are multiple places, it must be
+// initialized with multiple dev_ctxes_!
+
+struct FetchBarrierOpHandle : public OpHandleBase {
  public:
-  FuseVarsOpHandle(ir::Node *node, Scope *local_scope,
-                   const platform::Place &place,
-                   const std::unordered_map &inputs_numel,
-                   const proto::VarType::Type var_type)
-      : OpHandleBase(node),
-        local_scope_(local_scope),
-        place_(place),
-        inputs_numel_(inputs_numel),
-        type_(var_type) {
-    total_numel_ = 0;
-    for (auto in_numel : inputs_numel) {
-      PADDLE_ENFORCE_GT(in_numel.second, 0);
-      total_numel_ += in_numel.second;
-    }
-  }
+  FetchBarrierOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
+                       const std::vector<platform::Place> &places);
 
-  std::string Name() const override;
+  bool IsMultiDeviceTransfer() override;
 
-  bool IsMultiDeviceTransfer() override { return false; };
+  std::string Name() const override;
 
  protected:
   void RunImpl() override;
 
+  bool NeedWait(VarHandleBase *in_var) override;
+
  private:
-  Scope *local_scope_;
-  const platform::Place place_;
-  const std::unordered_map inputs_numel_;
-  const proto::VarType::Type type_;
-  int64_t total_numel_;
+  std::unique_ptr<OperatorBase> op_;
+  std::vector<Scope *> local_scopes_;
+  std::vector<platform::Place> places_;
+  Scope *run_scope_;
+  platform::Place place_;
+
+  bool is_lock_and_record_event_free_{false};
 };
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index bbf81e1b8e49cae133858f7aa121701fb0f5456f..232d82a5da596a78d2999c4a4c4f7dda0c7cad7e 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -82,6 +82,8 @@ void FetchOpHandle::WaitInputVarGenerated(const platform::Place &place) {
   }
 }
 
+bool FetchOpHandle::IsMultiDeviceTransfer() { return true; }
+
 std::string FetchOpHandle::Name() const { return "Fetch"; }
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/fetch_op_handle.h b/paddle/fluid/framework/details/fetch_op_handle.h
index 6ce42f92d7f1e81eeafd1eb5c28ce3564a5ffebc..dbb7f4f6582f6e0f0b9b5702533852d12da1051c 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.h
+++ b/paddle/fluid/framework/details/fetch_op_handle.h
@@ -39,6 +39,8 @@ struct FetchOpHandle : public OpHandleBase {
 
   std::string Name() const override;
 
+  bool IsMultiDeviceTransfer() override;
+
  protected:
   void RunImpl() override;
diff --git a/paddle/fluid/framework/details/fuse_all_reduce_op_pass.cc b/paddle/fluid/framework/details/fuse_all_reduce_op_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f226491c9f5355d0418d672069b19a78ef53c595
--- /dev/null
+++ b/paddle/fluid/framework/details/fuse_all_reduce_op_pass.cc
@@ -0,0 +1,195 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
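FetchBarrierOpHandle::NeedWait above narrows the default waiting rule: an input only forces a wait when it was produced under a different device context than the barrier's own place. A toy model of that rule (all types here are simplified stand-ins, not Paddle's classes):

    #include <map>
    #include <string>

    // Minimal stand-ins for OpHandleBase/VarHandleBase, just enough to show
    // the NeedWait rule; every name here is illustrative.
    struct Ctx {};
    struct Op {
      std::map<std::string, Ctx *> ctx_of_place;
      Ctx *DeviceContext(const std::string &place) { return ctx_of_place[place]; }
    };
    struct Var {
      Op *generated_op = nullptr;
      Op *GeneratedOp() { return generated_op; }
    };

    // An input needs an explicit wait only when it was produced on a
    // different device context than the one this op will run on; inputs
    // produced on the same context are already ordered by the stream.
    bool NeedWait(Var *in, const std::string &place,
                  const std::map<std::string, Ctx *> &dev_ctxes) {
      return in && in->GeneratedOp() &&
             in->GeneratedOp()->DeviceContext(place) != dev_ctxes.at(place);
    }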
+
+#include
+#include
+#include
+
+#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
+#include "paddle/fluid/framework/details/container_cast.h"
+#include "paddle/fluid/framework/details/fused_all_reduce_op_handle.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+class FuseAllReduceOpPass : public ir::Pass {
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override {
+    ir::Graph &result = *graph;
+
+    auto &places = Get<const std::vector<platform::Place>>(kPlaces);
+    auto &local_scopes = Get<const std::vector<Scope *>>(kLocalScopes);
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+    auto *nccl_ctxs = &Get<platform::NCCLContextMap>(kNCCLCtxs);
+#endif
+
+    std::unordered_set<std::string> grads;
+    auto &params_grads = result.Get<ParamsAndGrads>(kParamsAndGrads);
+    size_t num_of_all_reduce = params_grads.size();
+    grads.reserve(num_of_all_reduce);
+    for (auto p_g : params_grads) {
+      grads.insert(p_g.second);
+    }
+
+    size_t num_place = places.size();
+    std::unordered_map<std::string, ir::Node *> all_reduce_ops;
+    all_reduce_ops.reserve(grads.size());
+    for (auto &node : result.Nodes()) {
+      if (node->IsOp()) {
+        PADDLE_ENFORCE(node->IsWrappedBy<OpHandleBase>());
+        auto *all_reduce_op_handle =
+            dynamic_cast<AllReduceOpHandle *>(&node->Wrapper<OpHandleBase>());
+        if (all_reduce_op_handle) {
+          auto inputs = DynamicCast<VarHandle>(all_reduce_op_handle->Inputs());
+          PADDLE_ENFORCE_EQ(inputs.size(), num_place);
+          // The inputs' names should all be the same.
+          auto &grad_name = inputs[0]->name();
+          for (size_t i = 1; i < inputs.size(); ++i) {
+            PADDLE_ENFORCE_EQ(inputs[i]->name(), grad_name,
+                              "The input name should be the same.");
+          }
+          PADDLE_ENFORCE_NE(grads.count(grad_name), static_cast<size_t>(0));
+          all_reduce_ops.emplace(grad_name, node);
+        }
+      }
+    }
+
+    VLOG(10) << "Found all_reduce_ops: " << all_reduce_ops.size();
+    if (all_reduce_ops.size() == 0) {
+      return std::move(graph);
+    }
+
+    PADDLE_ENFORCE_EQ(all_reduce_ops.size(), grads.size(),
+                      "The number of all_reduce OpHandles is not equal to the "
+                      "number of grads. Maybe some gradients are of a sparse "
+                      "type, which is not supported currently.");
+    VLOG(10) << "Insert fused_all_reduce";
+
+    auto &group_grads_params =
+        graph->Get<GroupGradsAndParams>(kGroupGradsAndParams);
+
+    for (auto &group_g_p : group_grads_params) {
+      size_t group_size = group_g_p.size();
+      PADDLE_ENFORCE_GT(group_size, static_cast<size_t>(0));
+      std::vector<ir::Node *> group_all_reduce_ops;
+      group_all_reduce_ops.reserve(group_size);
+      for (auto &g_p : group_g_p) {
+        group_all_reduce_ops.emplace_back(all_reduce_ops.at(g_p.first));
+      }
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+      InsertFusedAllReduce(places, local_scopes, group_size,
+                           group_all_reduce_ops, nccl_ctxs, &result);
+#else
+      InsertFusedAllReduce(places, local_scopes, group_size,
+                           group_all_reduce_ops, &result);
+#endif
+    }
+    return std::move(graph);
+  }
+
+  void InsertFusedAllReduce(const std::vector<platform::Place> &places,
+                            const std::vector<Scope *> &local_scopes,
+                            const size_t num_of_all_reduce,
+                            const std::vector<ir::Node *> &all_reduce_ops,
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+                            const platform::NCCLContextMap *nccl_ctxs,
+#endif
+                            ir::Graph *result) const {
+    std::vector<VarHandleBase *> inputs;
+    std::vector<VarHandleBase *> outputs;
+    for (auto &op : all_reduce_ops) {
+      auto &op_handle = op->Wrapper<OpHandleBase>();
+      inputs.insert(inputs.end(), op_handle.Inputs().begin(),
+                    op_handle.Inputs().end());
+      // Remove this op from the outputs of its input vars.
+      for_each(op_handle.Inputs().begin(), op_handle.Inputs().end(),
+               [&op_handle](VarHandleBase *var_handle) {
+                 var_handle->RemoveOutput(&op_handle, op_handle.Node());
+               });
+
+      outputs.insert(outputs.end(), op_handle.Outputs().begin(),
+                     op_handle.Outputs().end());
+      // Clear the generated op of its output vars.
+      for_each(
+          op_handle.Outputs().begin(), op_handle.Outputs().end(),
+          [](VarHandleBase *var_handle) { var_handle->ClearGeneratedOp(); });
+
+      result->RemoveNode(op_handle.Node());
+    }
+
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+    CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, places,
+                           local_scopes, nccl_ctxs, result);
+#else
+    CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, places,
+                           local_scopes, result);
+#endif
+  }
+
+ private:
+  void CreateFusedAllReduceOp(const std::vector<VarHandleBase *> &inputs,
+                              const std::vector<VarHandleBase *> &outputs,
+                              const size_t num_of_all_reduce,
+                              const std::vector<platform::Place> &places,
+                              const std::vector<Scope *> &local_scopes,
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+                              const platform::NCCLContextMap *nccl_ctxs,
+#endif
+                              ir::Graph *result) const {
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+    auto *op_handle = new FusedAllReduceOpHandle(
+        result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation),
+        local_scopes, places, num_of_all_reduce, nccl_ctxs);
+#else
+    auto *op_handle = new FusedAllReduceOpHandle(
+        result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation),
+        local_scopes, places, num_of_all_reduce);
+#endif
+
+    for (auto in : inputs) {
+      op_handle->AddInput(in);
+    }
+
+    for (auto out : outputs) {
+      op_handle->AddOutput(out);
+    }
+
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+    if (!nccl_ctxs) {
+      SetCommunicationContext(places, op_handle);
+    }
+#else
+    SetCommunicationContext(places, op_handle);
+#endif
+  }
+
+  void SetCommunicationContext(const std::vector<platform::Place> &places,
+                               FusedAllReduceOpHandle *op_handle) const {
+    for (size_t i = 0; i < places.size(); ++i) {
+      op_handle->SetDeviceContext(
+          places[i], platform::DeviceContextPool::Instance().Get(places[i]));
+    }
+  }
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(fuse_all_reduce_op_pass,
+              paddle::framework::details::FuseAllReduceOpPass);
diff
--git a/paddle/fluid/framework/details/fuse_vars_op_handle.cc b/paddle/fluid/framework/details/fuse_vars_op_handle.cc deleted file mode 100644 index 14292c0a5d06aa3ff12b46b5768b136fa925752d..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/fuse_vars_op_handle.cc +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/details/fuse_vars_op_handle.h" - -namespace paddle { -namespace framework { -namespace details { - -void FuseVarsOpHandle::RunImpl() { - WaitInputVarGenerated(place_); - - auto in_var_handles = DynamicCast(this->Inputs()); - auto out_var_handles = DynamicCast(this->Outputs()); - PADDLE_ENFORCE_EQ(in_var_handles.size(), 0UL); - PADDLE_ENFORCE_EQ(out_var_handles.size() - 1, inputs_numel_.size(), ""); - - auto scope = local_scope_->FindVar(kLocalExecScopeName)->Get(); - - auto out_var_handle = out_var_handles[0]; - auto out_var = scope->Var(out_var_handle->name()); - - auto out_tensor = out_var->GetMutable(); - out_tensor->Resize({total_numel_}).mutable_data(this->place_, type_); - - int64_t s = 0; - for (size_t i = 1; i < out_var_handles.size(); ++i) { - auto out_name = out_var_handles[i]->name(); - auto out_t = scope->Var(out_name)->GetMutable(); - auto numel = this->inputs_numel_.at(out_name); - out_t->ShareDataWith(out_tensor->Slice(s, s + numel)); - s += numel; - } - this->RunAndRecordEvent([] {}); -} - -std::string FuseVarsOpHandle::Name() const { return "fuse vars"; } -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc new file mode 100644 index 0000000000000000000000000000000000000000..644cd4e15083519d6c685ae3e6a0737692018a07 --- /dev/null +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc @@ -0,0 +1,249 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
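FusedAllReduceOpHandle, implemented below, relies on alloc_continuous_space having laid the grouped gradients out back-to-back: RunImpl sorts the per-device tensors by address and checks that each one starts exactly where the previous one ends, then issues a single collective over the whole flat buffer instead of one call per gradient. A standalone sketch of that contiguity check (illustrative types, not part of the patch):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // One gradient tensor's buffer, reduced to its address and byte length.
    struct Chunk {
      const char *data;
      int64_t bytes;
    };

    // After sorting by address, the group is fusable as one flat buffer
    // only if every chunk begins where the previous one ends.
    bool IsContiguous(std::vector<Chunk> chunks) {
      std::sort(chunks.begin(), chunks.end(),
                [](const Chunk &a, const Chunk &b) { return a.data < b.data; });
      for (size_t k = 1; k < chunks.size(); ++k) {
        if (chunks[k - 1].data + chunks[k - 1].bytes != chunks[k].data)
          return false;
      }
      return true;
    }

Fusing the group this way trades N small collective calls, each with its own launch and synchronization overhead, for a single large one, which is the main source of the speedup.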
+#include "paddle/fluid/framework/details/fused_all_reduce_op_handle.h" +#include +#include +#include "paddle/fluid/framework/details/container_cast.h" +#include "paddle/fluid/framework/details/reduce_and_gather.h" +#include "paddle/fluid/framework/details/variable_visitor.h" +#include "paddle/fluid/platform/profiler.h" + +DEFINE_bool(skip_fused_all_reduce_check, false, ""); +namespace paddle { +namespace framework { +namespace details { + +typedef std::vector>> + GradientAndLoDTensor; + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +FusedAllReduceOpHandle::FusedAllReduceOpHandle( + ir::Node *node, const std::vector &local_scopes, + const std::vector &places, const size_t num_of_all_reduce, + const platform::NCCLContextMap *ctxs) + : OpHandleBase(node), + local_scopes_(local_scopes), + places_(places), + num_of_all_reduce_(num_of_all_reduce), + nccl_ctxs_(ctxs) { + if (nccl_ctxs_) { + for (auto &p : places_) { + this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p)); + } + } + PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); +} +#else + +FusedAllReduceOpHandle::FusedAllReduceOpHandle( + ir::Node *node, const std::vector &local_scopes, + const std::vector &places, const size_t num_of_all_reduce) + : OpHandleBase(node), + local_scopes_(local_scopes), + places_(places), + num_of_all_reduce_(num_of_all_reduce) { + PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); +} + +#endif + +void FusedAllReduceOpHandle::RunImpl() { + platform::RecordEvent record_event(Name()); + + VLOG(4) << this->DebugString(); + + WaitInputVarGenerated(); + // The input: grad0(dev0), grad0(dev1), grad1(dev0), grad1(dev1)... + // The output: grad0(dev0), grad0(dev1), grad1(dev0), grad1(dev1)... + auto in_var_handles = DynamicCast(this->Inputs()); + auto out_var_handles = DynamicCast(this->Outputs()); + + size_t place_num = places_.size(); + PADDLE_ENFORCE_EQ( + in_var_handles.size(), place_num * num_of_all_reduce_, + "The NoDummyInputSize should be equal to the number of places."); + PADDLE_ENFORCE_EQ( + in_var_handles.size(), out_var_handles.size(), + "The NoDummyInputSize and NoDummyOutputSize should be equal."); + + GradientAndLoDTensor grads_tensor; + grads_tensor.resize(place_num); + + int64_t numel = -1; + auto dtype = static_cast(0); + for (size_t scope_idx = 0; scope_idx < local_scopes_.size(); ++scope_idx) { + auto &g_tensor = grads_tensor.at(scope_idx); + g_tensor.reserve(num_of_all_reduce_); + + GetGradLoDTensor(scope_idx, in_var_handles, out_var_handles, &g_tensor); + + int64_t element_num = 0; + framework::proto::VarType::Type ele_dtype = + static_cast(0); + GetDTypeAndNumel(g_tensor, &ele_dtype, &element_num); + + if (numel == -1) { + numel = element_num; + } + if (dtype == static_cast(0)) { + dtype = ele_dtype; + PADDLE_ENFORCE_NE(ele_dtype, + static_cast(0)); + } + PADDLE_ENFORCE_EQ(ele_dtype, dtype); + + // Check whether the address space is contiguous. 
+ std::sort( + g_tensor.begin(), g_tensor.end(), + [](const std::pair &grad1, + const std::pair &grad2) -> bool { + return grad1.second->data() < grad2.second->data(); + }); + + for (size_t k = 1; k < g_tensor.size(); ++k) { + const void *cur_address = g_tensor.at(k - 1).second->data(); + int64_t len = g_tensor.at(k - 1).second->numel(); + auto offset = len * framework::SizeOfType(dtype); + void *infer_next_address = reinterpret_cast( + reinterpret_cast(cur_address) + offset); + const void *next_address = g_tensor.at(k).second->data(); + + VLOG(10) << string::Sprintf( + "Input[%d](%s) address: 0X%02x, Input[%d](%s) address: 0X%02x, Infer " + "input[%d] address: 0X%02x. The offset: %d", + k - 1, g_tensor.at(k - 1).first, cur_address, g_tensor.at(k).first, k, + next_address, k, infer_next_address, offset); + PADDLE_ENFORCE_EQ(infer_next_address, next_address, + "The address is not consistent."); + } + } + + if (!FLAGS_skip_fused_all_reduce_check) { + for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) { + for (size_t j = 1; j < num_of_all_reduce_; ++j) { + PADDLE_ENFORCE_EQ(grads_tensor.at(0).at(j).first, + grads_tensor.at(scope_idx).at(j).first); + } + } + } + + std::vector lod_tensor_data; + for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) { + auto data = grads_tensor.at(scope_idx).at(0).second->data(); + lod_tensor_data.emplace_back(data); + } + + if (platform::is_gpu_place(places_[0])) { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr."); + int nccl_dtype = platform::ToNCCLDataType(dtype); + std::vector> all_reduce_calls; + for (size_t i = 0; i < local_scopes_.size(); ++i) { + auto &p = places_[i]; + void *buffer = const_cast(lod_tensor_data.at(i)); + + int dev_id = boost::get(p).device; + auto &nccl_ctx = nccl_ctxs_->at(dev_id); + auto stream = nccl_ctx.stream(); + auto comm = nccl_ctx.comm_; + all_reduce_calls.emplace_back([=] { + PADDLE_ENFORCE(platform::dynload::ncclAllReduce( + buffer, buffer, numel, static_cast(nccl_dtype), + ncclSum, comm, stream)); + }); + } + + this->RunAndRecordEvent([&] { + if (all_reduce_calls.size() == 1UL) { + // Do not use NCCLGroup when manage NCCL by per thread per device + all_reduce_calls[0](); + } else { + platform::NCCLGroupGuard guard; + for (auto &call : all_reduce_calls) { + call(); + } + } + }); +#else + PADDLE_THROW("Not compiled with CUDA"); +#endif + } else { + // Special handle CPU only Operator's gradient. 
+    auto grad_name = grads_tensor.at(0).at(0).first;
+    auto &trg = *this->local_scopes_[0]
+                     ->FindVar(kLocalExecScopeName)
+                     ->Get<Scope *>()
+                     ->FindVar(grad_name)
+                     ->GetMutable<LoDTensor>();
+
+    // Reduce all device buffers into trg on the CPU.
+    ReduceBufferData func(lod_tensor_data, trg.data<void>(), numel);
+    VisitDataType(trg.type(), func);
+
+    for (size_t i = 1; i < local_scopes_.size(); ++i) {
+      auto &scope =
+          *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
+      auto &p = places_[i];
+      auto *var = scope.FindVar(grad_name);
+      auto *dev_ctx = dev_ctxes_.at(p);
+      size_t size = numel * SizeOfType(trg.type());
+      RunAndRecordEvent(p, [&trg, var, dev_ctx, p, size] {
+        auto dst_ptr = var->GetMutable<LoDTensor>()->data<void>();
+        platform::CPUPlace cpu_place;
+        memory::Copy(cpu_place, dst_ptr, cpu_place, trg.data<void>(), size);
+      });
+    }
+  }
+}
+
+void FusedAllReduceOpHandle::GetGradLoDTensor(
+    const size_t &scope_idx, const std::vector<VarHandle *> &in_var_handles,
+    const std::vector<VarHandle *> &out_var_handles,
+    std::vector<std::pair<std::string, const LoDTensor *>> *grad_tensor)
+    const {
+  auto *local_scope =
+      local_scopes_.at(scope_idx)->FindVar(kLocalExecScopeName)->Get<Scope *>();
+  size_t place_num = places_.size();
+
+  for (size_t j = 0; j < in_var_handles.size(); j += place_num) {
+    auto var_name = in_var_handles[j]->name();
+    PADDLE_ENFORCE_EQ(var_name, out_var_handles[j]->name());
+    auto &lod_tensor = local_scope->FindVar(var_name)->Get<LoDTensor>();
+    PADDLE_ENFORCE_EQ(lod_tensor.place(), places_.at(scope_idx));
+    grad_tensor->emplace_back(std::make_pair(var_name, &lod_tensor));
+  }
+}
+
+void FusedAllReduceOpHandle::GetDTypeAndNumel(
+    const std::vector<std::pair<std::string, const LoDTensor *>> &grad_tensor,
+    proto::VarType::Type *dtype, int64_t *numel) const {
+  *numel = 0;
+  for (size_t i = 0; i < grad_tensor.size(); ++i) {
+    // Get the element count
+    int64_t len = grad_tensor.at(i).second->numel();
+    PADDLE_ENFORCE_GT(len, 0);
+    *numel += len;
+
+    // Get the dtype
+    auto ele_type = grad_tensor.at(i).second->type();
+    if (i == 0) {
+      *dtype = ele_type;
+    }
+    PADDLE_ENFORCE_EQ(ele_type, *dtype);
+  }
+}
+
+std::string FusedAllReduceOpHandle::Name() const { return "fused_all_reduce"; }
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
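Editorial note (not part of the patch): the handle above collapses `num_of_all_reduce_` per-gradient all-reduces into a single call per device over the packed buffer. Assuming two devices, the work built in RunImpl is roughly:

    // sketch of the lambdas assembled in RunImpl (names abbreviated)
    platform::NCCLGroupGuard guard;  // batches the per-device calls
    ncclAllReduce(buf0, buf0, numel, nccl_dtype, ncclSum, comm0, stream0);
    ncclAllReduce(buf1, buf1, numel, nccl_dtype, ncclSum, comm1, stream1);

with `numel` the summed element count of all fused gradients, so the per-call latency is paid once rather than once per gradient.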
diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..79772c61f8c8b7abe3cf26dd8a94c2acdc0872a0
--- /dev/null
+++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h
@@ -0,0 +1,76 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct FusedAllReduceOpHandle : public OpHandleBase {
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+  FusedAllReduceOpHandle(ir::Node *node,
+                         const std::vector<Scope *> &local_scopes,
+                         const std::vector<platform::Place> &places,
+                         const size_t num_of_all_reduce,
+                         const platform::NCCLContextMap *ctxs);
+#else
+  FusedAllReduceOpHandle(ir::Node *node,
+                         const std::vector<Scope *> &local_scopes,
+                         const std::vector<platform::Place> &places,
+                         const size_t num_of_all_reduce);
+#endif
+  std::string Name() const override;
+
+  // Delaying and buffering nccl_all_reduce calls together can significantly
+  // increase performance. Disable this feature by returning false.
+  bool IsMultiDeviceTransfer() override { return true; };
+
+ protected:
+  void RunImpl() override;
+
+ private:
+  std::vector<Scope *> local_scopes_;
+  std::vector<platform::Place> places_;
+  size_t num_of_all_reduce_;
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+  const platform::NCCLContextMap *nccl_ctxs_;
+#endif
+
+  // Check the dtype of the inputs
+  void GetDTypeAndNumel(
+      const std::vector<std::pair<std::string, const LoDTensor *>> &g_tensor,
+      proto::VarType::Type *dtype, int64_t *total_num) const;
+
+  // Get the gradients' names and LoDTensors
+  void GetGradLoDTensor(const size_t &scope_idx,
+                        const std::vector<VarHandle *> &in_var_handles,
+                        const std::vector<VarHandle *> &out_var_handles,
+                        std::vector<std::pair<std::string, const LoDTensor *>>
+                            *grad_tensor) const;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/graph_test_base.h b/paddle/fluid/framework/details/graph_test_base.h
index 126959bcd80a4677f76b7cff677a82a319f7cfb3..d139f8488309eecf89c924a346ab0e574edc86dc 100644
--- a/paddle/fluid/framework/details/graph_test_base.h
+++ b/paddle/fluid/framework/details/graph_test_base.h
@@ -68,11 +68,11 @@ class SplitOpMaker : public OpProtoAndCheckerMaker {
 class DummyVarTypeInference : public VarTypeInference {
  public:
-  void operator()(const OpDesc& op_desc, BlockDesc* block) const override {
-    auto& inputs = op_desc.Input("X");
-    auto type = block->Var(inputs.front())->GetType();
-    auto out_var_name = op_desc.Output("Out").front();
-    block->Var(out_var_name)->SetType(type);
+  void operator()(framework::InferVarTypeContext* ctx) const override {
+    auto& inputs = ctx->Input("X");
+    auto type = ctx->GetType(inputs.front());
+    auto out_var_name = ctx->Output("Out").front();
+    ctx->SetType(out_var_name, type);
   }
 };
diff --git a/paddle/fluid/framework/details/inplace_op_pass.cc b/paddle/fluid/framework/details/inplace_op_pass.cc
index c91fc81b2defc9fe6b5720ce652a9aa94b27735e..88f26b41618e4b74766f2caa00ad29fd912f48f9 100644
--- a/paddle/fluid/framework/details/inplace_op_pass.cc
+++ b/paddle/fluid/framework/details/inplace_op_pass.cc
@@ -16,6 +16,9 @@
 #include <algorithm>
 #include <deque>
 #include <iterator>
+#include <memory>
+#include <queue>
+#include <sstream>
 #include <stack>
 #include <string>
 #include <vector>
@@ -147,12 +150,14 @@ std::unique_ptr<ir::Graph> InplacePass::ApplyImpl(
   view_.Build(graph.get());
   InitSSAGraphNodes();
 
+  auto cnt = 0;
   for (auto* op : view_.AllOps()) {
+    VLOG(4) << "Handle op " << cnt++ << ": " << op->Name();
     if (FLAGS_enable_inplace_whitelist && !whitelist_.count(op->Name()))
       continue;
     TryInplaceOpInputOutput(op, graph.get());
   }
-  graph->ResolveHazard(var_nodes_);
+  // graph->ResolveHazard(var_nodes_);
 
   return graph;
 }
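Editorial note (not part of the patch): with this change `infer_inplace` is invoked with the op desc alone and returns parameter-level pairs, which the next hunk resolves to the first variable bound to each parameter, skipping empty parameters. A minimal hypothetical inferer for a unary op would look like:

    // hypothetical example; {"X" => "Out"} are parameter names, not variables
    std::unordered_map<std::string, std::string> operator()(
        const OpDesc& op_desc) const {
      return {{"X", "Out"}};
    }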
@@ -263,9 +268,10 @@ void InplacePass::WithdrawModify(const NodeSwapQueue& nodes,
 
 void InplacePass::TryInplaceOpInputOutput(ir::Node* op,
                                           ir::Graph* graph) const {
   VLOG(4) << "Try to inplace op " << op->Name();
-  PADDLE_ENFORCE(op->Op() != nullptr && op->Op()->Block() != nullptr,
-                 "op_desc is nullptr");
+  // PADDLE_ENFORCE(op->Op() != nullptr && op->Op()->Block() != nullptr,
+  //                "op_desc is nullptr");
   // Some prerequisites must be met if the op is to be inplaced.
+  PADDLE_ENFORCE(op->Op() != nullptr, "op_desc is nullptr");
   auto* op_desc = op->Op();
   auto& infer_inplace =
@@ -276,21 +282,58 @@ void InplacePass::TryInplaceOpInputOutput(ir::Node* op,
   PADDLE_ENFORCE(static_cast<bool>(infer_inplace),
                  "%s's infer_inplace has not been registered", op_desc->Type());
 
-  auto* block = op_desc->Block();
-  auto in_to_outs = infer_inplace(*op_desc, block);
+  auto in_to_outs = infer_inplace(*op_desc);
 
   auto& all_ops = view_.AllOps();
   auto cursor = std::find(all_ops.begin(), all_ops.end(), op);
   size_t idx = std::distance(all_ops.begin(), cursor);
 
   for (auto& pair : in_to_outs) {
-    auto& in_var_name = pair.first;
-    auto& out_var_name = pair.second;
+    auto& in_para_name = pair.first;
+    auto& out_para_name = pair.second;
+
+    auto input_vars = op->Op()->Input(in_para_name);
+    if (!input_vars.size()) {
+      VLOG(4) << "Parameter " << in_para_name << " is empty, skip the "
+              << in_para_name << " => " << out_para_name << " pair";
+      continue;
+    }
+    auto output_vars = op->Op()->Output(out_para_name);
+    if (!output_vars.size()) {
+      VLOG(4) << "Parameter " << out_para_name << " is empty, skip the "
+              << in_para_name << " => " << out_para_name << " pair";
+      continue;
+    }
+    auto in_var_name = input_vars.at(0);
+    auto out_var_name = output_vars.at(0);
 
     auto* in_node = view_.GetNodeByName(in_var_name, op->inputs);
     auto* out_node = view_.GetNodeByName(out_var_name, op->outputs);
 
+    VLOG(4) << "Try to inplace " << in_var_name << " with " << out_var_name;
+
+    bool can_replace = true;
+    if (in_var_name == out_var_name) {
+      can_replace = false;
+      VLOG(4) << "SKIP: Input variable " << in_var_name
+              << " & Output variable " << out_var_name << " are the same";
+    } else if (!NodeCanReused(in_node)) {
+      can_replace = false;
+      VLOG(4) << "SKIP: Input variable " << in_var_name
+              << " cannot be reused";
+    } else if (!NodeCanReused(out_node)) {
+      can_replace = false;
+      VLOG(4) << "SKIP: Output variable " << out_var_name
+              << " cannot be reused";
+    } else if (details::NodeSize(*in_node->Var()) !=
+               details::NodeSize(*out_node->Var())) {
+      can_replace = false;
+      VLOG(4) << "SKIP: Input and output variable sizes do not match";
+    }
+
+    if (!can_replace) continue;
+
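+    // (Editorial sketch, not part of the patch.) The check below uses the
+    // BFS levels assigned by GraphView::TopoSort: if `in_node` has another
+    // consumer A besides this op B, B may overwrite in_node only when
+    // CheckOpDeps(A, B) proves A runs first, i.e. level(A) < level(B) and
+    // A's output transitively reaches one of B's inputs.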
"inplace such pair will overwrite the memory.", @@ -337,6 +380,97 @@ void InplacePass::TryInplaceOpInputOutput(ir::Node* op, } } +void GraphView::TopoSort(ir::Graph* graph) { + // + ops_.clear(); + auto deps_num = [](ir::Node* op) { + auto cnt = 0; + for (auto& var : op->inputs) + if (var->inputs.size() > 0) ++cnt; + return cnt; + }; + + std::queue> ready_ops; + + int level = 0; + auto nodes = graph->Nodes(); + std::unordered_map deps_map; + for (auto& node : nodes) { + if (node->IsOp() && node->Op() != nullptr) { + deps_map[node] = deps_num(node); + if (0 == deps_map[node]) { + ready_ops.push({node, level}); + } + } + } + + while (!ready_ops.empty()) { + auto item = ready_ops.front(); + ready_ops.pop(); + + ops_.emplace_back(item.first); + // record level when pop from queue + op_level_[item.first] = item.second; + + for (auto node : item.first->outputs) { + for (auto op : node->outputs) { + --deps_map[op]; + if (deps_map[op] == 0) ready_ops.push({op, item.second + 1}); + } + } + } + + bool all_ops_checked = true; + for (auto& node : nodes) { + if (node->IsOp() && node->Op() != nullptr && deps_map[node] > 0) { + all_ops_checked = false; + break; + } + } + + PADDLE_ENFORCE(all_ops_checked, "All ops deps should be 0 after analysis"); +} + +// return true if current op node depeneds on all other op that use the same +// variable node +bool GraphView::CheckDeps(ir::Node* var, ir::Node* current_op) const { + // get op list that rely on the same variable + auto op_list = var->outputs; + for (auto& op : op_list) { + if (op == current_op) continue; + + VLOG(4) << " GraphView::CheckDeps : " << op->Name() << " & " + << current_op->Name(); + if (!CheckOpDeps(op, current_op)) return false; + VLOG(4) << ""; + } + return true; +} + +// check if op2 depends on op1's output +bool GraphView::CheckOpDeps(ir::Node* op1, ir::Node* op2) const { + auto print_op = [&](ir::Node* op, const char* name) { + std::ostringstream os; + os << " " << name << " : " << op->Name() << " "; + os << "Input args : "; + for (auto& arg : op->inputs) os << arg->Name() << " "; + os << "Output args : "; + for (auto& arg : op->outputs) os << arg->Name() << " "; + os << "Level : " << op_level_.at(op); + VLOG(4) << os.str(); + }; + print_op(op1, "OP1"); + print_op(op2, "OP2"); + + if (op1 == op2) return true; + if (op_level_.at(op1) >= op_level_.at(op2)) return false; + + for (auto& var : op2->inputs) + if (var->inputs.size() > 0 && CheckOpDeps(op1, var->inputs[0])) return true; + + return false; +} + ir::Node* GraphView::GetNodeByName(const std::string& name, const std::vector& nodes) const { // nodes should be op->inputs/outputs @@ -382,22 +516,7 @@ void GraphView::Build(ir::Graph* g) { // Because we insert some new created node. Which may have data race between // nodes. // resolve data harzards depends on the var nodes in right order. - ops_ = SortOpLikeDescOrder(*g); - - // 1. track the nodes which reused previous node in Python memory optimize. - // these node can not be inplaced, otherwise may generate a circle in graph. - std::unordered_set all_vars; - for (auto& node : g->Nodes()) { - if (node->IsVar()) continue; - for (auto& out : node->outputs) { - if (out->IsCtrlVar() || out->Var() == nullptr) continue; - if (all_vars.count(out->Name())) { - dup_nodes_.emplace(out->Name()); - } else { - all_vars.emplace(out->Name()); - } - } - } + TopoSort(g); // 2. track the nodes which used by parameter server. 
// these node can not be inplaced, otherwise trainer diff --git a/paddle/fluid/framework/details/inplace_op_pass.h b/paddle/fluid/framework/details/inplace_op_pass.h index 7be7f311852d2b64ce95e1a939371760d03d296b..01964ba8fc43fa86bb99c185fa20b056fddbffee 100644 --- a/paddle/fluid/framework/details/inplace_op_pass.h +++ b/paddle/fluid/framework/details/inplace_op_pass.h @@ -14,6 +14,7 @@ #pragma once #include +#include #include #include #include @@ -50,10 +51,15 @@ class GraphView { // map the parameter and gradient, must be skipped. bool InSkipSet(const std::string& var) const; + bool CheckDeps(ir::Node* var, ir::Node* current_op) const; + bool CheckOpDeps(ir::Node* op1, ir::Node* op2) const; + void TopoSort(ir::Graph* g); + private: std::vector ops_; std::unordered_set dup_nodes_; // mem opt affect nodes std::map> adj_list_; + std::unordered_map op_level_; }; // swap pairs in sequence diff --git a/paddle/fluid/framework/details/memory_optimize_helper.cc b/paddle/fluid/framework/details/memory_optimize_helper.cc index c89a33fc959247afb74dab49056fc3fca8b9bd89..894d7dad2e623649fe96b00bb515c9605c89a404 100644 --- a/paddle/fluid/framework/details/memory_optimize_helper.cc +++ b/paddle/fluid/framework/details/memory_optimize_helper.cc @@ -190,7 +190,7 @@ struct NodeComparator { auto rhs_shape = rhs_desc->GetShape(); if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) || (lhs_shape[0] != -1 && rhs_shape[0] != -1)) { - return NodeSize(lhs) <= NodeSize(rhs); + return NodeSize(lhs) == NodeSize(rhs); } else { return false; } @@ -337,7 +337,6 @@ bool NodeCanReused(const VarDesc& node) { auto type = node.GetType(); // only these types holds bulk of gpu memory if (!(type == proto::VarType::LOD_TENSOR || - type == proto::VarType::SELECTED_ROWS || type == proto::VarType::LOD_TENSOR_ARRAY)) { return false; } @@ -450,6 +449,7 @@ void ControlFlowGraph::LiveVariableAnalysis() { live_in_[op].insert(var); } for (auto& var : defs_[op]) { + if (uses_[op].count(var)) continue; live_in_[op].erase(var); } diff --git a/paddle/fluid/framework/details/memory_optimize_helper_test.cc b/paddle/fluid/framework/details/memory_optimize_helper_test.cc index 5389e76e0c65c7c0ee23004ca1b0a56efb4c54fe..453943af0f123a08b870f11dacb78a5fbd954a56 100644 --- a/paddle/fluid/framework/details/memory_optimize_helper_test.cc +++ b/paddle/fluid/framework/details/memory_optimize_helper_test.cc @@ -142,15 +142,16 @@ TEST(OrderedSet, FindBestFitNode) { for (auto& node : nodes) { pool.Insert(node.get()); } - + // FIXME(liuwei1031) this API has changed, + // disable these tests temporarily // FindNextBestFitNode - auto* n = nodes[0].get(); - auto* cache = pool.FindBestFitNode(n); - PADDLE_ENFORCE(cache->Name() == "a"); - cache = pool.FindNextBestFitNode(n, cache); - PADDLE_ENFORCE(cache->Name() == "c"); - cache = pool.FindNextBestFitNode(n, cache); - PADDLE_ENFORCE(cache->Name() == "b"); + // auto* n = nodes[0].get(); + // auto* cache = pool.FindBestFitNode(n); + // PADDLE_ENFORCE(cache->Name() == "a"); + // cache = pool.FindNextBestFitNode(n, cache); + // PADDLE_ENFORCE(cache->Name() == "c"); + // cache = pool.FindNextBestFitNode(n, cache); + // PADDLE_ENFORCE(cache->Name() == "b"); } } // namespace details diff --git a/paddle/fluid/framework/details/memory_optimize_pass.cc b/paddle/fluid/framework/details/memory_optimize_pass.cc index e7284ea64438557161a0c97a6a7f45fb9bb245ca..80720af32d5670928a6ad2b9efbeadf6452b0273 100644 --- a/paddle/fluid/framework/details/memory_optimize_pass.cc +++ b/paddle/fluid/framework/details/memory_optimize_pass.cc @@ 
-24,6 +24,7 @@ #include #include #include +#include #include #include "gflags/gflags.h" #include "paddle/fluid/framework/data_type.h" @@ -191,6 +192,10 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const { // immediately to make the subblock variable reuse strategy take // effect. Because it is a single op in graph. No need to // update the ir nodes. + // FIXME(liuwei1031): Graph is not aware of the existence of + // BlockDescs and ProgramDescs. + // The operations related to BlockDesc or ProgramDesc should perform + // on Graph or Node directly! sub_op_desc->Rename(var->Name(), cache->Name()); if (sub_op_desc->Block() != nullptr && sub_op_desc->Block()->HasVar(var->Name())) { diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 478d2ffbcf2988487893984284d4597f018f0ca0..8c61684c9c9643476bf2fe780172c9c34a2e1986 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -11,18 +11,20 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/details/multi_devices_graph_pass.h" #include #include +#include #include +#include +#include #include #include - #include "paddle/fluid/framework/details/all_reduce_op_handle.h" #include "paddle/fluid/framework/details/broadcast_op_handle.h" #include "paddle/fluid/framework/details/computation_op_handle.h" -#include "paddle/fluid/framework/details/data_balance_op_handle.h" +#include "paddle/fluid/framework/details/fetch_barrier_op_handle.h" #include "paddle/fluid/framework/details/fused_broadcast_op_handle.h" -#include "paddle/fluid/framework/details/multi_devices_graph_pass.h" #include "paddle/fluid/framework/details/reduce_op_handle.h" #include "paddle/fluid/framework/details/rpc_op_handle.h" #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" @@ -30,6 +32,7 @@ #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/operators/math/math_function.h" namespace paddle { namespace framework { @@ -134,21 +137,26 @@ void AddOutputToLeafOps(ir::Graph *graph) { } } // namespace +void MultiDevSSAGraphBuilderBase::CheckGraph(const ir::Graph &graph) const {} + void MultiDevSSAGraphBuilderBase::Init() const { all_vars_.clear(); loss_var_name_ = Get(kLossVarName); + VLOG(10) << "Init MultiDevSSAGraphBuilder, loss name: " << loss_var_name_; places_ = Get>(kPlaces); local_scopes_ = Get>(kLocalScopes); strategy_ = Get(kStrategy); #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - nccl_ctxs_ = &Get("nccl_ctxs"); + nccl_ctxs_ = &Get(kNCCLCtxs); #endif + PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); } std::unique_ptr MultiDevSSAGraphBuilderBase::ApplyImpl( std::unique_ptr graph) const { Init(); + CheckGraph(*graph); std::vector sorted_ops = SortOperations(*graph); auto nodes = graph->ReleaseNodes(); @@ -166,7 +174,6 @@ std::unique_ptr MultiDevSSAGraphBuilderBase::ApplyImpl( result.Set(kGraphOps, new GraphOps); bool is_forwarding = true; - bool insert_collection_ops = NeedCollectiveOps(); for (ir::Node *node : sorted_ops) { if (DealWithSpecialOp(&result, node)) { @@ -185,8 +192,8 @@ std::unique_ptr MultiDevSSAGraphBuilderBase::ApplyImpl( CreateComputationalOps(&result, node, places_.size()); } - // Insert 
collection ops - if (!is_forwarding && insert_collection_ops) { + // Insert collective ops if nranks > 1 + if (!is_forwarding && Get(kNRanks) > 1) { try { bool is_bk_op = static_cast(boost::get(node->Op()->GetAttr( @@ -200,13 +207,14 @@ std::unique_ptr MultiDevSSAGraphBuilderBase::ApplyImpl( boost::get>(node->Op()->GetNullableAttr( OpProtoAndCheckerMaker::OpRoleVarAttrName())); PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); - for (size_t i = 0; i < backward_vars.size(); i += 2) { auto &p_name = backward_vars[i]; auto &g_name = backward_vars[i + 1]; - VLOG(10) << "Bcast " << g_name << " for parameter " << p_name; - - InsertCollectiveOp(&result, p_name, g_name); + VLOG(10) << "Bcast " << g_name << " for parameter " << p_name + << " op_type " << node->Op()->Type(); + if (NeedCollectiveForGrad(g_name, sorted_ops)) { + InsertCollectiveOp(&result, p_name, g_name); + } } } catch (boost::bad_get e) { } @@ -226,6 +234,7 @@ std::unique_ptr MultiDevSSAGraphBuilderBase::ApplyImpl( * Only variables should be the leaves of graph. */ AddOutputToLeafOps(&result); + result.Erase(kGraphOps); return graph; } @@ -258,6 +267,11 @@ void MultiDevSSAGraphBuilderBase::InsertScaleLossGradOp( } } +bool MultiDevSSAGraphBuilderBase::DealWithSpecialOp(ir::Graph *result, + ir::Node *node) const { + return false; +} + std::vector MultiDevSSAGraphBuilderBase::SortOperations( const ir::Graph &graph) const { return ir::TopologySortOperations(graph); @@ -271,8 +285,20 @@ bool MultiDevSSAGraphBuilderBase::UseGPU() const { return use_gpu; } -bool MultiDevSSAGraphBuilderBase::NeedCollectiveOps() const { - return Get(kNRanks) > 1; +bool MultiDevSSAGraphBuilderBase::NeedCollectiveForGrad( + const std::string &grad_name, std::vector ops) const { + // if we have allreduce_op for current gradient variable in the graph, + // then we don't need to add allreduce_op_handle for this gradient + // NOTE: This is for the case that all gradients should add collective ops + for (auto *node : ops) { + if (node->Op()->Type() != "allreduce") continue; + for (auto in_name : node->Op()->InputArgumentNames()) { + if (in_name == grad_name) { + return false; + } + } + } + return true; } void MultiDevSSAGraphBuilderBase::CreateOpHandleIOs(ir::Graph *result, @@ -390,8 +416,9 @@ void MultiDevSSAGraphBuilderBase::CreateComputationalOp(ir::Graph *result, CreateOpHandleIOs(result, node, dev_id); } -void MultiDevSSAGraphBuilderBase::CreateAllReduceOp( - ir::Graph *result, const std::string &og) const { +void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(ir::Graph *result, + const std::string &og, + bool is_encoded) const { OpHandleBase *op_handle = nullptr; auto append_allreduce_op = [&]( @@ -400,7 +427,9 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp( #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) result->Get(kGraphOps).emplace_back(new AllReduceOpHandle( result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), - scopes, places, nccl_ctxs_)); + scopes, places, nccl_ctxs_, is_encoded, + static_cast(strategy_.trainers_endpoints_.size()) * + places_.size())); #else result->Get(kGraphOps).emplace_back(new AllReduceOpHandle( result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), @@ -422,12 +451,15 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp( PADDLE_ENFORCE(!vars.empty()); auto &prev_grad = vars.back(); op_handle->AddInput(prev_grad); + VLOG(10) << "all_reduce_op_handle add input " << prev_grad->DebugString(); auto var = new VarHandle(result->CreateEmptyNode(og, ir::Node::Type::kVariable), vars.size(), i, og, 
places_[i]); vars.emplace_back(var); op_handle->AddOutput(var); + VLOG(10) << "all_reduce_op_handle add output " << og + << ", handle:" << var->DebugString(); } } @@ -496,20 +528,17 @@ VarHandle *MultiDevSSAGraphBuilderBase::CreateReduceOp(ir::Graph *result, } bool MultiDevSSAGraphBuilderBase::IsScaleLossOp(ir::Node *node) const { - return boost::get( + return !loss_var_name_.empty() && node->Op() && + boost::get( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == (static_cast(OpRole::kBackward) | - static_cast(OpRole::kLoss)) && - !loss_var_name_.empty(); // If loss_var is empty. This is test mode + static_cast(OpRole::kLoss)); } bool MultiDevSSAGraphBuilderBase::IsSparseGradient( const std::string &og) const { PADDLE_ENFORCE(all_vars_.count(og) != 0); - if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) { - return true; - } - return false; + return all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS; } void AllReduceSSAGraphBuilder::InsertCollectiveOp( @@ -831,9 +860,17 @@ int DistSSAGraphBuilder::CreateRPCOp(ir::Graph *result, ir::Node *node) const { PADDLE_ENFORCE(op_dev_id != -1, "can not find the right place for rpc op: %s", node->Op()->Type()); - result->Get(kGraphOps).emplace_back(new RPCOpHandle( - result->CreateOpNode(node->Op()), *node->Op(), local_scopes_[op_dev_id], - node->Op()->Type(), places_[op_dev_id])); + + // Create fetch_barrier op handle to enable output on all devices. + // **NOTE** fetch_barrier should output variables list same as recv op does. + if (node->Op()->Type() == "fetch_barrier") { + result->Get(kGraphOps).emplace_back(new FetchBarrierOpHandle( + result->CreateOpNode(node->Op()), local_scopes_, places_)); + } else { + result->Get(kGraphOps).emplace_back(new RPCOpHandle( + result->CreateOpNode(node->Op()), *node->Op(), local_scopes_[op_dev_id], + node->Op()->Type(), places_[op_dev_id])); + } if (node->Op()->Type() == "send") { CreateOpHandleIOs(result, node, op_dev_id); @@ -912,6 +949,17 @@ int DistSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, return op_dev_id; } +bool DistSSAGraphBuilder::IsEncoded(const std::string &p_name) const { + auto u_name = p_name + "__dgc_u__"; + auto it = all_vars_.find(u_name); + if (it == all_vars_.end()) { + VLOG(10) << "can't find u_name, so it's not encoded:" << u_name; + return false; + } + + return true; +} + void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result, const std::string &p_name, const std::string &g_name) const { @@ -927,7 +975,11 @@ void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result, CreateReduceOp(result, g_name, 0); CreateBroadcastOp(result, g_name, 0); } else { - CreateAllReduceOp(result, g_name); +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + CreateAllReduceOp(result, g_name, IsEncoded(p_name)); +#else + PADDLE_ENFORCE(false, "Compiled withoud cuda!"); +#endif } break; default: @@ -995,7 +1047,7 @@ static int MultiDevSSAGraphBuilderRegister(const std::string &builder_mode) { REGISTER_MULTI_DEVICES_PASS(reduce_mode_multi_devices_pass, paddle::framework::details::ReduceSSAGraphBuilder); REGISTER_MULTI_DEVICES_PASS( - allreduce_mode_multi_devices_pass, + all_reduce_mode_multi_devices_pass, paddle::framework::details::AllReduceSSAGraphBuilder); REGISTER_MULTI_DEVICES_PASS(dist_multi_devices_pass, paddle::framework::details::DistSSAGraphBuilder); diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index 
6d4386538ea7d0cc318647c92282af9d598fa699..8bfd7b9bf8f96c074996486e70184ca56c6f7167 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -14,7 +14,10 @@ #pragma once +#include #include +#include +#include #include #include @@ -31,12 +34,6 @@ namespace framework { class Scope; namespace details { -constexpr char kLossVarName[] = "loss_var_name"; -constexpr char kPlaces[] = "places"; -constexpr char kLocalScopes[] = "local_scopes"; -constexpr char kStrategy[] = "strategy"; -constexpr char kNRanks[] = "nranks"; - class MultiDevSSAGraphBuilderBase : public ir::Pass { protected: std::unique_ptr ApplyImpl( @@ -44,18 +41,21 @@ class MultiDevSSAGraphBuilderBase : public ir::Pass { virtual void Init() const; + virtual void CheckGraph(const ir::Graph &graph) const; + virtual std::vector SortOperations(const ir::Graph &graph) const; virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, const std::string &g_name) const = 0; - virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const = 0; + virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const; virtual void InsertPostprocessOps(ir::Graph *result) const = 0; bool UseGPU() const; - bool NeedCollectiveOps() const; + bool NeedCollectiveForGrad(const std::string &grad_name, + std::vector ops) const; bool IsScaleLossOp(ir::Node *node) const; @@ -75,7 +75,8 @@ class MultiDevSSAGraphBuilderBase : public ir::Pass { bool IsSparseGradient(const std::string &og) const; - void CreateAllReduceOp(ir::Graph *result, const std::string &og) const; + void CreateAllReduceOp(ir::Graph *result, const std::string &og, + bool is_encoded = false) const; void CreateBroadcastOp(ir::Graph *result, const std::string &p_name, size_t src_dev_id) const; @@ -109,10 +110,6 @@ class AllReduceSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, const std::string &g_name) const; - virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const { - return false; - } - virtual void InsertPostprocessOps(ir::Graph *result) const {} }; @@ -175,6 +172,8 @@ class DistSSAGraphBuilder : public BalanceVarSSAGraphBuilder { mutable std::vector> bcast_var_name_set_; mutable bool need_broadcast_var_{false}; + + bool IsEncoded(const std::string &p_name) const; }; std::unordered_set &MultiDevSSAGraphBuilder(); diff --git a/paddle/fluid/framework/details/multi_devices_helper.h b/paddle/fluid/framework/details/multi_devices_helper.h index 9afbb91005c9c3a9d2e185f4dfa901ebf812ee19..ab5e099023382c4e28a9613d321ea8dc182d3534 100644 --- a/paddle/fluid/framework/details/multi_devices_helper.h +++ b/paddle/fluid/framework/details/multi_devices_helper.h @@ -16,6 +16,9 @@ #include #include +#include +#include +#include #include #include "paddle/fluid/framework/details/op_handle_base.h" @@ -44,6 +47,26 @@ const char kGraphVars[] = "vars"; typedef std::unordered_set GraphDepVars; const char kGraphDepVars[] = "dep_vars"; +constexpr char kNCCLCtxs[] = "nccl_ctxs"; + +constexpr char kLossVarName[] = "loss_var_name"; +constexpr char kPlaces[] = "places"; +constexpr char kLocalScopes[] = "local_scopes"; +constexpr char kStrategy[] = "strategy"; +constexpr char kNRanks[] = "nranks"; + +typedef std::unordered_set FusedVars; +constexpr char kFusedVars[] = "fused_vars"; + +typedef std::vector> ParamsAndGrads; +constexpr char kParamsAndGrads[] = "params_grads"; + +typedef std::vector>> + 
GroupGradsAndParams; +constexpr char kGroupGradsAndParams[] = "group_grads_params"; + +constexpr char kFusedVarNamePrefix[] = "@FUSEDVAR@"; + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 4822627ac3b65972f41d9a23d9fe3dba3de3f97d..413b14961631b3459e0d05af685ad1c5395844c2 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/details/op_handle_base.h" #include +#include namespace paddle { namespace framework { @@ -41,15 +42,42 @@ OpHandleBase::~OpHandleBase() { void OpHandleBase::Run(bool use_cuda) { #ifdef PADDLE_WITH_CUDA - if (events_.empty() && use_cuda) { + if (events_.empty() && use_cuda && dev_ctxes_.size() > 0) { for (auto &p : dev_ctxes_) { int dev_id = boost::get(p.first).device; PADDLE_ENFORCE(cudaSetDevice(dev_id)); PADDLE_ENFORCE( cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming)); } + if (IsMultiDeviceTransfer() && dev_ctxes_.size() > 0) { + for (auto &out_var : outputs_) { + auto *out_var_handle = dynamic_cast(out_var); + if (out_var_handle) { + int dev_id = + boost::get(out_var_handle->place()).device; + out_var_handle->SetGenerateEvent(events_.at(dev_id)); + } + } + } else { + PADDLE_ENFORCE_EQ(dev_ctxes_.size(), 1UL, + "%s should have only one dev_ctx.", Name()); + auto &place = dev_ctxes_.begin()->first; + int dev_id = boost::get(place).device; + for (auto &out_var : outputs_) { + auto *out_var_handle = dynamic_cast(out_var); + if (out_var_handle) { + PADDLE_ENFORCE( + platform::is_same_place(place, out_var_handle->place()), + "The place of input(%s) is not consistent with the " + "place of current op(%s).", + out_var_handle->Name(), Name()); + out_var_handle->SetGenerateEvent(events_.at(dev_id)); + } + } + } } #else + PADDLE_ENFORCE(!use_cuda); #endif @@ -93,17 +121,48 @@ void OpHandleBase::AddOutput(VarHandleBase *out) { void OpHandleBase::WaitInputVarGenerated() { for (auto in_var : inputs_) { if (NeedWait(in_var)) { - for (auto &pair : dev_ctxes_) { - in_var->GeneratedOp()->RecordWaitEventOnCtx(pair.second); + // Dummy Variable is used to represent dependencies between operators, so + // there doesn't add event for it. + auto *in_var_handle = dynamic_cast(in_var); + if (in_var_handle) { + auto &place = in_var_handle->place(); + if (platform::is_gpu_place(place)) { +#ifdef PADDLE_WITH_CUDA + auto stream = + static_cast(dev_ctxes_.at(place)) + ->stream(); + PADDLE_ENFORCE( + cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); +#else + PADDLE_THROW("Doesn't compile the GPU."); +#endif + } + // There are nothing to do when the place is CPUPlace. } } } } void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) { - for (auto *in : inputs_) { - if (NeedWait(in)) { - in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_.at(place)); + for (auto in_var : inputs_) { + if (NeedWait(in_var)) { + // Dummy Variable is used to represent dependencies between operators, so + // there doesn't add event for it. 
+ auto *in_var_handle = dynamic_cast(in_var); + if (in_var_handle) { + if (platform::is_gpu_place(in_var_handle->place())) { +#ifdef PADDLE_WITH_CUDA + auto stream = static_cast( + dev_ctxes_.at(in_var_handle->place())) + ->stream(); + PADDLE_ENFORCE( + cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); +#else + PADDLE_THROW("Doesn't compile the GPU."); +#endif + } + // There are nothing to do when the place is CPUPlace. + } } } } diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h index 0901e59f9786b43361e7a570f8c2a07be54c1ac2..e5b58ec68761469a03929435d1a73bf0a2d1660e 100644 --- a/paddle/fluid/framework/details/op_registry.h +++ b/paddle/fluid/framework/details/op_registry.h @@ -16,9 +16,12 @@ limitations under the License. */ #include #include +#include +#include #include #include "paddle/fluid/framework/grad_op_desc_maker.h" #include "paddle/fluid/framework/inplace_op_inference.h" +#include "paddle/fluid/framework/no_need_buffer_vars_inference.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/operator.h" @@ -34,27 +37,86 @@ enum OpInfoFillType { kGradOpDescMaker = 2, kVarTypeInference = 3, kShapeInference = 4, - kInplaceOpInference = 5 + kInplaceOpInference = 5, + kNoNeedBufferVarsInference = 6, + kUnknown = -1 }; +namespace internal { +template +struct TypePair { + using Type = T; + static constexpr OpInfoFillType kFillType = kType; +}; + +using OpRegistryClasses = std::tuple< // NOLINT + TypePair, // NOLINT + TypePair, // NOLINT + TypePair, // NOLINT + TypePair, // NOLINT + TypePair, // NOLINT + TypePair, // NOLINT + TypePair // NOLINT + >; + +static constexpr int kOpRegistryClassNumber = + std::tuple_size::value; + +template +struct IsMatchedBaseTypeImpl { + using PairType = typename std::tuple_element::type; + static constexpr bool kValue = + std::is_base_of::value; +}; + +template +struct IsMatchedBaseTypeImpl { + static constexpr bool kValue = false; +}; + +template +static inline constexpr bool IsMatchedBaseType() { + return IsMatchedBaseTypeImpl< + T, kPos, (kPos >= 0 && kPos < kOpRegistryClassNumber)>::kValue; +} + +template +struct OpInfoFillTypeGetterImpl {}; + +// This case should not happen +template +struct OpInfoFillTypeGetterImpl {}; + +template +struct OpInfoFillTypeGetterImpl { + static constexpr OpInfoFillType kType = kUnknown; +}; + +template +struct OpInfoFillTypeGetterImpl { + static constexpr OpInfoFillType kType = + OpInfoFillTypeGetterImpl()>::kType; +}; + +template +struct OpInfoFillTypeGetterImpl { + using PairType = typename std::tuple_element::type; + static constexpr OpInfoFillType kType = PairType::kFillType; +}; + +template +using OpInfoFillTypeGetter = + OpInfoFillTypeGetterImpl()>; + +} // namespace internal + template struct OpInfoFillTypeID { static constexpr OpInfoFillType ID() { - return std::is_base_of::value - ? kOperator - : (std::is_base_of::value - ? kOpProtoAndCheckerMaker - : (std::is_base_of::value - ? kGradOpDescMaker - : (std::is_base_of::value - ? kVarTypeInference - : (std::is_base_of::value - ? kShapeInference - : (std::is_base_of< - InplaceOpInference, T>::value - ? 
kInplaceOpInference - : static_cast( - -1)))))); + return internal::OpInfoFillTypeGetter::kType; } }; @@ -127,9 +189,9 @@ struct OpInfoFiller { template struct OpInfoFiller { void operator()(const char* op_type, OpInfo* info) const { - info->infer_var_type_ = [](const OpDesc& fwd_op, BlockDesc* block) { + info->infer_var_type_ = [](InferVarTypeContext* context) { T inference; - inference(fwd_op, block); + inference(context); }; } }; @@ -147,9 +209,21 @@ struct OpInfoFiller { template struct OpInfoFiller { void operator()(const char* op_type, OpInfo* info) const { - info->infer_inplace_ = [](const OpDesc& op_desc, BlockDesc* block) { + info->infer_inplace_ = [](const OpDesc& op_desc) { T infer; - return infer(op_desc, block); + return infer(op_desc); + }; + } +}; + +template +struct OpInfoFiller { + void operator()(const char* op_type, OpInfo* info) const { + info->infer_no_need_buffer_vars_ = [](const VariableNameMap& inputs, + const VariableNameMap& outputs, + const AttributeMap& attrs) { + T infer(inputs, outputs, attrs); + return infer(); }; } }; diff --git a/paddle/fluid/framework/details/reduce_and_gather.h b/paddle/fluid/framework/details/reduce_and_gather.h index 2e5256fbd49a3f8c72840cd55dada4301cb04eb9..0de8e436518ea353a185087b0e4668b5d200c966 100644 --- a/paddle/fluid/framework/details/reduce_and_gather.h +++ b/paddle/fluid/framework/details/reduce_and_gather.h @@ -53,6 +53,31 @@ struct ReduceLoDTensor { } }; +struct ReduceBufferData { + const std::vector &src_data_; + void *dst_data_; + int64_t numel_; + + ReduceBufferData(const std::vector &src, void *dst, + int64_t numel) + : src_data_(src), dst_data_(dst), numel_(numel) {} + + template + void apply() const { + T *dst_data = reinterpret_cast(dst_data_); + for (size_t i = 0; i < src_data_.size(); ++i) { + auto srd_data = reinterpret_cast(src_data_[i]); + VLOG(10) << "dst: " << dst_data_ << ", " << srd_data; + if (srd_data == dst_data_) { + continue; + } + + std::transform(srd_data, srd_data + numel_, dst_data, dst_data, + [](T a, T b) -> T { return a + b; }); + } + } +}; + inline void GatherLocalSelectedRows( const std::vector &src_selecte_rows_, const std::vector &in_places, diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index 13a042d8e6ed7f18c76387b666d681df0eabd0b5..c218e55b70dc7c5e140a8611b3b8310bc43b2c0e 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -12,9 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include #include #include +#include +#include +#include #include #include "paddle/fluid/framework/details/computation_op_handle.h" @@ -189,13 +193,77 @@ ExtractComputationOpFromLastLivedVar(VarHandle *var, size_t scope_idx, return shrink_func(computation_op); } -static VarDesc *TryGetLatestVarDesc(const std::vector &vars) { - VarDesc *var_desc = nullptr; - std::find_if(vars.rbegin(), vars.rend(), [&](VarHandle *var_handle) -> bool { - var_desc = var_handle->Node()->Var(); - return var_desc != nullptr; - }); - return var_desc; +/** + * Shrink op dependencies according to no need buffer vars. + * + * If some ops do not need Tensor buffer of any input, + * just remove the dependency of this op, i.e, decrease reference count. + * + * For example, input Y of elementwise_add_grad op is only used to infer shape + * and lod of Y@GRAD, we do not need the buffer of input Y. 
Data buffer of + * input Y can be collected before elementwise_add_grad op runs. + * + * This method returns whether the dependency count decreases to 0, and + * shrinks op dependency if possible. + */ +static bool ShrinkNoNeedBufferVarOpDependency( + const std::string &var_name, + std::unordered_set *op_handles) { + std::vector skip_ops; + for (auto *op_handle : *op_handles) { + auto *op_base = op_handle->GetOp(); + auto &inferer = op_base->Info().NoNeedBufferVarsInferer(); + if (!inferer) { + continue; + } + + std::unordered_set no_need_buffer_vars = + inferer(op_base->Inputs(), op_base->Outputs(), op_base->Attrs()); + + // Check whether var_name occurs in other inputs or outputs of the op + // If it occurs, we cannot decrease the dependency number. + bool occurred_in_other_vars = false; + for (auto &in_pair : op_base->Inputs()) { + if (no_need_buffer_vars.count(in_pair.first) > 0) { + continue; + } + + auto &args = in_pair.second; + auto iter = std::find(args.begin(), args.end(), var_name); + if (iter != args.end()) { + occurred_in_other_vars = true; + break; + } + } + + if (occurred_in_other_vars) { + continue; + } + + for (auto &out_pair : op_base->Outputs()) { + auto &args = out_pair.second; + auto iter = std::find(args.begin(), args.end(), var_name); + if (iter != args.end()) { + occurred_in_other_vars = true; + break; + } + } + + if (!occurred_in_other_vars) { + VLOG(2) << "Shrink var " << var_name << " in op " << op_handle->Name(); + skip_ops.emplace_back(op_handle); + } + } + + if (skip_ops.size() == op_handles->size()) { + op_handles->clear(); + return true; + } else { + for (auto *skip_op : skip_ops) { + op_handles->erase(skip_op); + } + return false; + } } std::unique_ptr ReferenceCountPass::ApplyImpl( @@ -234,17 +302,44 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( continue; } - bool ok; - auto result = ExtractComputationOpFromLastLivedVar( - name_var_pair.second.back(), i, shrink_func, &ok); + auto &var_name = name_var_pair.first; + auto &var_handles = name_var_pair.second; + + for (auto iter = var_handles.rbegin(); iter != var_handles.rend(); + ++iter) { + bool ok; + auto result = + ExtractComputationOpFromLastLivedVar(*iter, i, shrink_func, &ok); + + // Seldomly, some vars may have no pending or preceding computation ops + // Just break; + if (!ok) break; + VLOG(10) << "Extract " << result.size() << " ops of var " << var_name; + + size_t original_op_deps = result.size(); + // If all ops do not need buffer of var_name, calculate reference count + // of the previous version of var_name. + if (ShrinkNoNeedBufferVarOpDependency(var_name, &result)) { + VLOG(10) << "Try to precede reference count computing at var " + << var_name; + continue; + } + + size_t final_op_deps = result.size(); + if (final_op_deps < original_op_deps) { + VLOG(5) << "Shrink op deps from " << original_op_deps << " to " + << final_op_deps; + } - if (ok) { - auto &var_name = name_var_pair.first; PADDLE_ENFORCE(!result.empty(), "Last living ops of %s cannot be empty", var_name); ref_cnts[i].emplace(var_name, result.size()); last_live_ops_of_vars[i].emplace(var_name, std::move(result)); + break; } + + // Seldomly, all preceding trying failed. 
+ // Just skip this corner case } } diff --git a/paddle/fluid/framework/details/reference_count_pass_helper.cc b/paddle/fluid/framework/details/reference_count_pass_helper.cc index 89bd08c2d041d795205b29bb29aba311d1dbd932..94de0e6ab0a91d90a7f2c4c4fc14eb78663c95fe 100644 --- a/paddle/fluid/framework/details/reference_count_pass_helper.cc +++ b/paddle/fluid/framework/details/reference_count_pass_helper.cc @@ -13,9 +13,22 @@ // limitations under the License. #include "paddle/fluid/framework/details/reference_count_pass_helper.h" +#include "paddle/fluid/framework/details/var_handle.h" +#include "paddle/fluid/framework/var_desc.h" namespace paddle { namespace framework { -namespace details {} // namespace details +namespace details { + +VarDesc *TryGetLatestVarDesc(const std::vector &vars) { + VarDesc *var_desc = nullptr; + std::find_if(vars.rbegin(), vars.rend(), [&](VarHandle *var_handle) -> bool { + var_desc = var_handle->Node()->Var(); + return var_desc != nullptr; + }); + return var_desc; +} + +} // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/reference_count_pass_helper.h b/paddle/fluid/framework/details/reference_count_pass_helper.h index 1c083dbf001b08e40a54cc89b21c3dea1f18f16a..ce700119c54ddd711315dfa45d61b9241cfda651 100644 --- a/paddle/fluid/framework/details/reference_count_pass_helper.h +++ b/paddle/fluid/framework/details/reference_count_pass_helper.h @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -25,6 +26,10 @@ namespace paddle { namespace framework { + +class VarDesc; +class VarHandle; + namespace details { class ComputationOpHandle; @@ -43,9 +48,11 @@ const char kGarbageCollector[] = "garbage_collector"; const char kAllPlaces[] = "all_places"; using LastLiveOpsOfVars = - std::unordered_map>; + std::unordered_map>; const char kLastLiveOpsOfVars[] = "last_live_ops_of_var"; +VarDesc *TryGetLatestVarDesc(const std::vector &vars); + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 9ba295a2b06a5ee9c3069e95fa688595fe72d6fd..c4254bbadfa17682f437f46f02adc9c884d24304 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -14,7 +14,6 @@ #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" -#include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/platform/profiler.h" @@ -27,62 +26,49 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( : graph_(graph), pool_(strategy.num_threads_ >= 2 ? 
new ::ThreadPool(strategy.num_threads_) : nullptr), + prepare_pool_(1), local_scopes_(local_scopes), places_(places), fetch_ctxs_(places), - running_ops_(0), - strategy_(strategy) {} + strategy_(strategy) { + PrepareOpDeps(); + CopyOpDeps(); +} FeedFetchList ThreadedSSAGraphExecutor::Run( const std::vector &fetch_tensors) { std::unique_ptr event( new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare")); - std::unordered_map pending_ops; - std::unordered_set pending_vars; - auto ready_vars = std::make_shared>(); - std::unordered_set ready_ops; + std::unique_ptr op_deps = op_deps_futures_.get(); + CopyOpDeps(); + VLOG(10) << "ThreadedSSAGraphExecutor::Run"; + std::shared_ptr> ready_vars( + new BlockingQueue); + auto &pending_ops = op_deps->pending_ops_; + auto &pending_vars = op_deps->pending_vars_; + auto &ready_ops = op_deps->ready_ops_; + // For ops (e.g. nccl_all_reduce) that need to coordinate multiple // streams from multiple GPUs, it's faster to buffer them and schedule // together since we currently cannot overlap computation and memcpy streams. // Should revisit it if overlapping is available. std::unordered_set delayed_ops; - // Transform SSAGraph to pending_ops & pending_vars - for (auto &var_map : graph_->Get(details::kGraphVars)) { - for (auto &name_pair : var_map) { - for (auto &version_pair : name_pair.second) { - InsertPendingVar(&pending_vars, ready_vars.get(), version_pair); - } - } - } - for (auto &var : graph_->Get(details::kGraphDepVars)) { - InsertPendingVar(&pending_vars, ready_vars.get(), var); - } - - for (auto &op : ir::FilterByNodeWrapper(*graph_)) { - if (op->Inputs().empty()) { // Special case, Op has no input. - ready_ops.insert(op); - } else { - InsertPendingOp(&pending_ops, op); - } - } - // Step 2. Insert FetchOps std::vector fetch_ops; std::unordered_set fetch_dependencies; FeedFetchList fetch_data(fetch_tensors.size()); - InsertFetchOps(fetch_tensors, &fetch_ops, &fetch_dependencies, &pending_ops, - &pending_vars, ready_vars.get(), &fetch_data); + InsertFetchOps(fetch_tensors, &fetch_ops, &fetch_dependencies, &ready_ops, + &pending_ops, &pending_vars, &fetch_data); auto run_all_ops = [&](std::unordered_set &set) { for (auto *op : set) { - running_ops_++; RunOp(ready_vars, op); } set.clear(); }; - + auto run_all_op = [&](OpHandleBase *op) { RunOp(ready_vars, op); }; // Clean run context run_op_futures_.clear(); exception_holder_.Clear(); @@ -91,19 +77,11 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( while (!pending_vars.empty()) { // 1. Run All Ready ops // Keep loop until all vars are ready. - // - // NOTE: DelayedOps have a lower priority. It will be scheduled after all - // ready_ops have been performed. - if (ready_ops.empty() && strategy_.allow_op_delay_ && running_ops_ == 0) { - run_all_ops(delayed_ops); - } else { - run_all_ops(ready_ops); - } + run_all_ops(ready_ops); // 2. Find ready variable bool timeout; auto cur_ready_vars = ready_vars->PopAll(1, &timeout); - if (timeout) { if (exception_holder_.IsCaught()) { for (auto &run_op_future : run_op_futures_) { @@ -115,6 +93,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( continue; } } + // 3. Remove the dependency of ready_var. // Find the ready_ops after the ready_var. 
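   // (Editorial sketch, not part of the patch.) Each pending op carries a
   // count of not-yet-ready inputs; when a variable becomes ready, every
   // consumer's count is decremented, and any op reaching zero is launched
   // immediately via run_all_op. E.g. an op with inputs {a, b} is queued
   // with count 2 and runs as soon as the second of a/b is produced.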
for (auto ready_var : cur_ready_vars) { @@ -123,11 +102,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( auto &deps = pending_ops[op]; --deps; if (deps == 0) { - if (op->IsMultiDeviceTransfer() && strategy_.allow_op_delay_) { - delayed_ops.insert(op); - } else { - ready_ops.insert(op); - } + run_all_op(op); } } } @@ -143,16 +118,17 @@ void ThreadedSSAGraphExecutor::InsertFetchOps( const std::vector &fetch_tensors, std::vector *fetch_ops, std::unordered_set *fetch_dependencies, + std::unordered_set *ready_ops, std::unordered_map *pending_ops, std::unordered_set *pending_vars, - BlockingQueue *ready_vars, FeedFetchList *fetch_data) { + FeedFetchList *fetch_data) { std::unordered_map> fetched_vars; - + std::unordered_set local_ready_vars; for (auto &fetch_var_name : fetch_tensors) { for (auto &var_map : graph_->Get(details::kGraphVars)) { auto it = var_map.find(fetch_var_name); if (it != var_map.end()) { - fetched_vars[fetch_var_name].push_back(*it->second.rbegin()); + fetched_vars[fetch_var_name].emplace_back(*it->second.rbegin()); } } } @@ -161,8 +137,9 @@ void ThreadedSSAGraphExecutor::InsertFetchOps( auto &var_name = fetch_tensors[i]; auto fetched_var_it = fetched_vars.find(var_name); PADDLE_ENFORCE(fetched_var_it != fetched_vars.end(), - "Cannot find fetched variable.(Perhaps the main_program " - "is not set to ParallelExecutor)"); + "Cannot find fetched variable(%s).(Perhaps the main_program " + "is not set to ParallelExecutor)", + var_name); auto &vars = fetched_var_it->second; @@ -184,9 +161,23 @@ void ThreadedSSAGraphExecutor::InsertFetchOps( auto *fetch_dummy = new DummyVarHandle(fetch_var); op->AddOutput(fetch_dummy); fetch_dependencies->emplace(fetch_dummy); - this->InsertPendingVar(pending_vars, ready_vars, fetch_dummy); - this->InsertPendingOp(pending_ops, op); + + this->InsertPendingVar(pending_vars, &local_ready_vars, fetch_dummy); + + size_t wait_input_num = 0; + std::unordered_set input_set(vars.begin(), vars.end()); + for (auto *var : input_set) { + if (pending_vars->count(var)) { + ++wait_input_num; + } + } + if (wait_input_num) { + pending_ops->insert({op, wait_input_num}); + } else { + ready_ops->insert(static_cast(op)); + } } + PADDLE_ENFORCE_EQ(local_ready_vars.size(), 0); } void ThreadedSSAGraphExecutor::InsertPendingOp( @@ -197,11 +188,63 @@ void ThreadedSSAGraphExecutor::InsertPendingOp( void ThreadedSSAGraphExecutor::InsertPendingVar( std::unordered_set *pending_vars, - BlockingQueue *ready_vars, VarHandleBase *var) const { + std::unordered_set *ready_vars, VarHandleBase *var) const { pending_vars->insert(var); if (var->GeneratedOp() == nullptr) { - ready_vars->Push(var); + ready_vars->insert(var); + } +} + +void ThreadedSSAGraphExecutor::PrepareOpDeps() { + op_deps_.reset(new OpDependentData()); + std::unordered_map &pending_ops = + op_deps_->pending_ops_; + std::unordered_set &pending_vars = op_deps_->pending_vars_; + std::unordered_set &ready_ops = op_deps_->ready_ops_; + std::unordered_set ready_vars; + + // Transform SSAGraph to pending_ops & pending_vars + for (auto &var_map : graph_->Get(details::kGraphVars)) { + for (auto &name_pair : var_map) { + for (auto &version_pair : name_pair.second) { + InsertPendingVar(&pending_vars, &ready_vars, version_pair); + } + } + } + for (auto &var : graph_->Get(details::kGraphDepVars)) { + InsertPendingVar(&pending_vars, &ready_vars, var); + } + + for (auto &op : ir::FilterByNodeWrapper(*graph_)) { + if (op->Inputs().empty()) { // Special case, Op has no input. 
+ ready_ops.insert(op); + } else { + InsertPendingOp(&pending_ops, op); + } } + for (auto ready_var : ready_vars) { + pending_vars.erase(ready_var); + for (auto *op : ready_var->PendingOps()) { + auto &deps = pending_ops[op]; + --deps; + if (deps == 0) { + ready_ops.insert(op); + } + } + } +} + +void ThreadedSSAGraphExecutor::CopyOpDeps() { + op_deps_futures_ = prepare_pool_.enqueue([&] { + auto *op_deps = new OpDependentData(); + op_deps->pending_ops_.insert(op_deps_->pending_ops_.begin(), + op_deps_->pending_ops_.end()); + op_deps->pending_vars_.insert(op_deps_->pending_vars_.begin(), + op_deps_->pending_vars_.end()); + op_deps->ready_ops_.insert(op_deps_->ready_ops_.begin(), + op_deps_->ready_ops_.end()); + return std::unique_ptr(op_deps); + }); } void ThreadedSSAGraphExecutor::RunOp( @@ -216,7 +259,6 @@ void ThreadedSSAGraphExecutor::RunOp( op->Run(strategy_.use_cuda_); } VLOG(10) << op << " " << op->Name() << " Done "; - running_ops_--; ready_var_q->Extend(op->Outputs()); VLOG(10) << op << " " << op->Name() << " Signal posted"; } catch (...) { diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 0867f6210480ec405e7cc4ea42c74b750133ea4e..b9bccba8fa2fa13d99a9a39a5135106101daa903 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -15,18 +15,20 @@ #pragma once #include +#include #include +#include #include +#include #include #include #include - -#include #include "ThreadPool.h" // ThreadPool in thrird party #include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/details/exception_holder.h" #include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/details/fetch_op_handle.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/ssa_graph_executor.h" #include "paddle/fluid/framework/ir/graph.h" @@ -36,6 +38,12 @@ class Scope; namespace details { +struct OpDependentData { + std::unordered_map pending_ops_; + std::unordered_set pending_vars_; + std::unordered_set ready_ops_; +}; + class ThreadedSSAGraphExecutor : public SSAGraphExecutor { public: ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy, @@ -57,29 +65,35 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { private: ir::Graph *graph_; std::unique_ptr<::ThreadPool> pool_; + ::ThreadPool prepare_pool_; std::vector local_scopes_; std::vector places_; platform::DeviceContextPool fetch_ctxs_; ExceptionHolder exception_holder_; - std::atomic running_ops_; void InsertPendingOp(std::unordered_map *pending_ops, OpHandleBase *op_instance) const; void InsertPendingVar(std::unordered_set *pending_vars, - BlockingQueue *ready_vars, + std::unordered_set *ready_vars, VarHandleBase *var) const; void InsertFetchOps(const std::vector &fetch_tensors, std::vector *fetch_ops, std::unordered_set *fetch_dependencies, + std::unordered_set *ready_ops, std::unordered_map *pending_ops, std::unordered_set *pending_vars, - BlockingQueue *ready_vars, FeedFetchList *fetch_data); + void PrepareOpDeps(); + void CopyOpDeps(); + private: + std::future> op_deps_futures_; + ExecutionStrategy strategy_; + std::unique_ptr op_deps_; // use std::list because clear(), push_back, and for_each are O(1) std::list> run_op_futures_; }; diff --git a/paddle/fluid/framework/details/var_handle.cc b/paddle/fluid/framework/details/var_handle.cc index 
30da029ca2a90e7faa6288557ff2f1aeb21cc1c6..95d62e66415e7879144d35f858ef04a8a936cd66 100644 --- a/paddle/fluid/framework/details/var_handle.cc +++ b/paddle/fluid/framework/details/var_handle.cc @@ -24,7 +24,8 @@ VarHandle::~VarHandle() { VLOG(4) << "deleting var handle " << DebugString(); } std::string VarHandle::DebugString() const { std::stringstream ss; - ss << name_ << ":" << place_; + ss << "name:" << name_ << ", place:" << place_ << ", version:" << version_ + << ", scope_idx:" << scope_idx_; return ss.str(); } diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h index 8321c32f8b1d73bf5e6080b4b314abc9fd20536d..93060ef2593cbc032a382b617f9690e392a15b63 100644 --- a/paddle/fluid/framework/details/var_handle.h +++ b/paddle/fluid/framework/details/var_handle.h @@ -43,6 +43,7 @@ struct VarHandleBase { virtual ~VarHandleBase(); virtual std::string DebugString() const = 0; + virtual const std::string& Name() const = 0; void AddInput(OpHandleBase* in, ir::Node* node) { node_->inputs.clear(); @@ -95,8 +96,6 @@ struct VarHandleBase { // // NOTE: runtime variables have place. struct VarHandle : public VarHandleBase { - explicit VarHandle(ir::Node* node) : VarHandleBase(node) {} - virtual ~VarHandle(); std::string DebugString() const override; @@ -109,6 +108,20 @@ struct VarHandle : public VarHandleBase { name_(std::move(name)), place_(std::move(place)) {} +#ifdef PADDLE_WITH_CUDA + bool HasEvent() { return has_event_; } + + const cudaEvent_t& GetEvent() { + PADDLE_ENFORCE(HasEvent(), "The event is not set."); + return event_; + } + + void SetGenerateEvent(const cudaEvent_t& event) { + has_event_ = true; + event_ = event; + } +#endif + // version field currently is not used, however, just store the version to // debug easily. private: @@ -116,6 +129,11 @@ struct VarHandle : public VarHandleBase { size_t scope_idx_; std::string name_; platform::Place place_; +#ifdef PADDLE_WITH_CUDA + // Only when this event is triggered, var is generated. + cudaEvent_t event_; + bool has_event_{false}; +#endif public: bool IsTheSameVar(const VarHandle& o) const { @@ -125,6 +143,7 @@ struct VarHandle : public VarHandleBase { size_t version() const { return version_; } size_t scope_idx() const { return scope_idx_; } + const std::string& Name() const override { return name_; } const std::string& name() const { return name_; } const platform::Place& place() const { return place_; } }; @@ -136,6 +155,10 @@ struct DummyVarHandle : public VarHandleBase { virtual ~DummyVarHandle(); std::string DebugString() const override; + + public: + const std::string& Name() const override { return name_; } + std::string name_{"DummyVar"}; }; } // namespace details diff --git a/paddle/fluid/framework/details/while_op_eager_deletion_pass.cc b/paddle/fluid/framework/details/while_op_eager_deletion_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..fd6b6dd2274d9721b8754e16cd7b4f1ab596380d --- /dev/null +++ b/paddle/fluid/framework/details/while_op_eager_deletion_pass.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/operators/controlflow/while_op_helper.h" + +namespace paddle { +namespace framework { +namespace details { + +class WhileOpEagerDeletionPass : public ir::Pass { + protected: + std::unique_ptr<ir::Graph> ApplyImpl( + std::unique_ptr<ir::Graph> graph) const override { + auto all_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph); + + // Find all while_op and while_grad_op + std::unordered_map<size_t, std::pair<std::vector<OperatorBase *>, + std::vector<OperatorBase *>>> + target_ops; + for (auto *op : all_ops) { + auto compute_op = dynamic_cast<ComputationOpHandle *>(op); + if (compute_op == nullptr) continue; + + if (compute_op->Name() == "while") { + target_ops[compute_op->GetScopeIdx()].first.emplace_back( + compute_op->GetOp()); + } else if (compute_op->Name() == "while_grad") { + target_ops[compute_op->GetScopeIdx()].second.emplace_back( + compute_op->GetOp()); + } + } + + for (auto &ops_pair : target_ops) { + auto &while_ops = ops_pair.second.first; + auto &while_grad_ops = ops_pair.second.second; + operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( + while_ops, while_grad_ops); + } + return graph; + } +}; + +} // namespace details +} // namespace framework +} // namespace paddle + +REGISTER_PASS(while_op_eager_deletion_pass, + paddle::framework::details::WhileOpEagerDeletionPass); diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index c31d0beec306fe165164837cd15c95b4efd76af0..0d4334f193dcb067a49f5e67b69d21531c7048bd 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -14,7 +14,12 @@ limitations under the License. */ #include "paddle/fluid/framework/executor.h" #include +#include +#include +#include +#include +#include "paddle/fluid/framework/executor_gc_helper.h" #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_tensor_array.h" @@ -23,17 +28,18 @@ limitations under the License.
*/ #include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/transfer_scope_cache.h" #include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/operators/controlflow/while_op_helper.h" #include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" #ifdef PADDLE_WITH_NGRAPH #include "paddle/fluid/operators/ngraph/ngraph_engine.h" +DEFINE_bool(use_ngraph, false, "Use NGRAPH to run"); #endif DECLARE_bool(benchmark); DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run"); -DEFINE_bool(use_ngraph, false, "Use NGRAPH to run"); namespace paddle { namespace framework { @@ -43,97 +49,23 @@ namespace { int kProgramId = -1; } // namespace -static std::unordered_map GetNonPersistableReferenceCounts( - const BlockDesc& block, const std::vector& skip_var_list) { - std::unordered_map ref_cnts; - std::unordered_set skip_vars(skip_var_list.begin(), - skip_var_list.end()); - - auto update_ref_cnts = [&](OpDesc* op_desc, const VariableNameMap& name_map) { - for (auto& name_pair : name_map) { - for (auto& name : name_pair.second) { - if (skip_vars.count(name)) continue; - auto* var_desc = block.FindVar(name); - if (var_desc == nullptr || var_desc->Persistable()) continue; - auto type = var_desc->Proto()->type().type(); - if (type != proto::VarType::LOD_TENSOR && - type != proto::VarType::SELECTED_ROWS && - type != proto::VarType::LOD_TENSOR_ARRAY) { - continue; - } - ++ref_cnts[name]; - } - } - }; - - for (auto op_desc : block.AllOps()) { - update_ref_cnts(op_desc, op_desc->Inputs()); - update_ref_cnts(op_desc, op_desc->Outputs()); - } - return ref_cnts; -} - ExecutorPrepareContext::ExecutorPrepareContext( - const framework::ProgramDesc& prog, size_t block_id, - const std::vector& skip_ref_cnt_vars) - : prog_(prog), block_id_(block_id) { - if (GetEagerDeletionThreshold() >= 0) { - global_ref_cnts_ = GetNonPersistableReferenceCounts(prog.Block(block_id), - skip_ref_cnt_vars); + const framework::ProgramDesc& prog, size_t block_id) + : prog_(prog), block_id_(block_id) {} + +void ExecutorPrepareContext::PrepareUnusedVars( + const std::vector& keep_vars, bool force_disable_gc) { + force_disable_gc_ = force_disable_gc; + if (GetEagerDeletionThreshold() < 0 || force_disable_gc_) { + return; } + unused_vars_ = GetUnusedVars(prog_.Block(block_id_), ops_, keep_vars); } ExecutorPrepareContext::~ExecutorPrepareContext() { VLOG(5) << "destroy ExecutorPrepareContext"; } -static void DeleteUnusedTensors( - const Scope& scope, const OperatorBase* op, GarbageCollector* gc, - std::unordered_map* ref_cnts) { - std::deque> garbages; - - auto handler = [&](const VariableNameMap& name_map) { - for (auto& name_pair : name_map) { - for (auto& name : name_pair.second) { - auto it = ref_cnts->find(name); - if (it == ref_cnts->end()) continue; - if (--(it->second) != 0) { - continue; - } - auto* var = scope.FindVar(name); - if (var == nullptr) { - continue; - } - - VLOG(2) << "Erase variable " << name; - if (var->IsType()) { - garbages.emplace_back( - var->GetMutable()->MoveMemoryHolder()); - } else if (var->IsType()) { - garbages.emplace_back(var->GetMutable() - ->mutable_value() - ->MoveMemoryHolder()); - } else if (var->IsType()) { - auto* lod_tensor_arr = var->GetMutable(); - for (auto& t : *lod_tensor_arr) { - garbages.emplace_back(t.MoveMemoryHolder()); - } - } else { - PADDLE_THROW("Type %s of %s is not supported eager deletion", - framework::ToTypeName(var->Type()), name); - } - } - } - }; - - 
handler(op->Inputs()); - handler(op->Outputs()); - - if (!garbages.empty()) { - gc->Add(std::move(garbages)); - } -} - Executor::Executor(const platform::Place& place) : place_(place) {} void Executor::Close() { @@ -184,13 +116,12 @@ void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope, } void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, - bool create_local_scope, bool create_vars) { + bool create_local_scope, bool create_vars, + const std::vector<std::string>& skip_ref_cnt_vars, + bool force_disable_gc) { platform::RecordBlock b(block_id); if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc); -#ifdef PADDLE_WITH_NGRAPH - if (FLAGS_use_ngraph) operators::NgraphEngine::EnableNgraph(pdesc); -#endif - auto ctx = Prepare(pdesc, block_id); + auto ctx = Prepare(pdesc, block_id, skip_ref_cnt_vars, force_disable_gc); RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars); } @@ -357,20 +288,28 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, std::unique_ptr<ExecutorPrepareContext> Executor::Prepare( const ProgramDesc& program, int block_id, - const std::vector<std::string>& skip_ref_cnt_vars) { + const std::vector<std::string>& skip_ref_cnt_vars, bool force_disable_gc) { std::unique_ptr<ExecutorPrepareContext> ctx( - new ExecutorPrepareContext(program, block_id, skip_ref_cnt_vars)); + new ExecutorPrepareContext(program, block_id)); PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size()); auto& block = program.Block(block_id); for (auto& op_desc : block.AllOps()) { ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc)); } +#ifdef PADDLE_WITH_NGRAPH + if (FLAGS_use_ngraph) { + paddle::operators::NgraphEngine::FuseNgraphOps( + ctx->prog_.Block(ctx->block_id_), &ctx->ops_); + } +#endif + ctx->PrepareUnusedVars(skip_ref_cnt_vars, force_disable_gc); return ctx; } std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare( const ProgramDesc& program, const std::vector<int>& block_ids, - const std::vector<std::vector<std::string>>& skip_ref_cnt_vars) { + const std::vector<std::vector<std::string>>& skip_ref_cnt_vars, + bool force_disable_gc) { PADDLE_ENFORCE( skip_ref_cnt_vars.empty() || skip_ref_cnt_vars.size() == block_ids.size(), "skip_ref_cnt_vars should be either empty or equals to block number %d", @@ -378,17 +317,17 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare( std::vector<std::shared_ptr<ExecutorPrepareContext>> result; size_t idx = 0; for (auto& bid : block_ids) { - ExecutorPrepareContext* ctx; - if (skip_ref_cnt_vars.empty()) { - ctx = new ExecutorPrepareContext(program, bid); - } else { - ctx = new ExecutorPrepareContext(program, bid, skip_ref_cnt_vars[idx]); - } PADDLE_ENFORCE_LT(static_cast<size_t>(bid), program.Size()); + auto* ctx = new ExecutorPrepareContext(program, bid); auto& block = program.Block(bid); for (auto& op_desc : block.AllOps()) { ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc)); } + if (skip_ref_cnt_vars.empty()) { + ctx->PrepareUnusedVars(std::vector<std::string>(), force_disable_gc); + } else { + ctx->PrepareUnusedVars(skip_ref_cnt_vars[idx], force_disable_gc); + } result.push_back(std::shared_ptr<ExecutorPrepareContext>(ctx)); ++idx; } @@ -409,9 +348,9 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, int64_t max_memory_size = GetEagerDeletionThreshold(); std::unique_ptr<GarbageCollector> gc; - // skip while_op and while_grad_op temporarily - if (max_memory_size >= 0 && !keep_kids) { - ctx->ResetReferenceCount(); + // FIXME(zjl): recurrent_op is rather complex, we would + // forcibly disable gc in recurrent_op + if (!ctx->force_disable_gc_ && max_memory_size >= 0) { #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place_)) { if (IsFastEagerDeletionModeEnabled()) { @@ -428,14 +367,18 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
#ifdef PADDLE_WITH_CUDA } #endif + // If gc is enabled and block size > 1 + if (gc && ctx->prog_.Size() > 1) { + operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(ctx->block_id_, + ctx->ops_); + } } for (auto& op : ctx->ops_) { op->Run(*local_scope, place_); if (gc) { - DeleteUnusedTensors(*local_scope, op.get(), gc.get(), - &(ctx->runtime_ref_cnts_)); + DeleteUnusedTensors(*local_scope, op.get(), ctx->unused_vars_, gc.get()); } } diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 5a040ac641588ad4d89d1f6e4c0d6c296eff38eb..825224437e0cdda03c56faf1b50833abd8b8c2ab 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -15,7 +15,9 @@ limitations under the License. */ #pragma once #include +#include #include +#include #include #include "paddle/fluid/framework/garbage_collector.h" #include "paddle/fluid/framework/op_info.h" @@ -28,20 +30,20 @@ namespace paddle { namespace framework { struct ExecutorPrepareContext { - ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id, - const std::vector& skip_ref_cnt_vars = - std::vector()); + ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id); ~ExecutorPrepareContext(); - void ResetReferenceCount() { runtime_ref_cnts_ = global_ref_cnts_; } + void PrepareUnusedVars(const std::vector& keep_vars, + bool force_disable_gc = false); const framework::ProgramDesc& prog_; - size_t block_id_; + const size_t block_id_; + std::vector> ops_; - std::unordered_map global_ref_cnts_; - std::unordered_map runtime_ref_cnts_; + std::unordered_map> unused_vars_; + bool force_disable_gc_{false}; }; class Executor { @@ -66,7 +68,10 @@ class Executor { * Scope */ void Run(const ProgramDesc& prog, Scope* scope, int block_id, - bool create_local_scope = true, bool create_vars = true); + bool create_local_scope = true, bool create_vars = true, + const std::vector& skip_ref_cnt_vars = + std::vector(), + bool force_disable_gc = false); // This API is very slow. void Run(const ProgramDesc& program, Scope* scope, @@ -79,12 +84,14 @@ class Executor { static std::unique_ptr Prepare( const ProgramDesc& program, int block_id, const std::vector& skip_ref_cnt_vars = - std::vector()); + std::vector(), + bool force_disable_gc = false); static std::vector> Prepare( const ProgramDesc& program, const std::vector& block_ids, const std::vector>& skip_ref_cnt_vars = - std::vector>()); + std::vector>(), + bool force_disable_gc = false); void CreateVariables(const ProgramDesc& pdesc, Scope* scope, int block_id); diff --git a/paddle/fluid/framework/executor_gc_helper.cc b/paddle/fluid/framework/executor_gc_helper.cc new file mode 100644 index 0000000000000000000000000000000000000000..77b0977b5a47fdf4413e75c4e89cf638949e937f --- /dev/null +++ b/paddle/fluid/framework/executor_gc_helper.cc @@ -0,0 +1,189 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
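The executor rework above replaces per-run reference counting with a precomputed unused-vars map and threads two new knobs through `Executor::Run` and `Executor::Prepare`: `skip_ref_cnt_vars` keeps the named variables alive for the whole block, and `force_disable_gc` switches eager deletion off for one prepared context (the FIXME above notes recurrent_op needs this). A minimal usage sketch under the new signatures; the program, scope, and variable name here are hypothetical, not part of this change:

```cpp
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/place.h"

// Sketch only: assumes `program` is a valid ProgramDesc built elsewhere.
void RunBlockZero(const paddle::framework::ProgramDesc& program) {
  paddle::framework::Scope scope;
  paddle::framework::Executor executor(paddle::platform::CPUPlace());

  // Keep "fetch_target" (hypothetical name) out of eager deletion so it can
  // be read after the run; all other non-persistable variables are freed
  // right after their last reader op when FLAGS_eager_delete_tensor_gb >= 0.
  auto ctx = paddle::framework::Executor::Prepare(
      program, /*block_id=*/0,
      /*skip_ref_cnt_vars=*/{"fetch_target"},
      /*force_disable_gc=*/false);

  executor.RunPreparedContext(ctx.get(), &scope,
                              /*create_local_scope=*/true,
                              /*create_vars=*/true);
}
```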
+ +#include "paddle/fluid/framework/executor_gc_helper.h" +#include +#include +#include +#include +#include +#include +#include "glog/logging.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { + +struct OpInOutInfo { + public: + void Build(const OperatorBase *op) { + is_built_ = true; + auto &inferer = op->Info().NoNeedBufferVarsInferer(); + if (inferer) { + no_need_buffer_ins_ = inferer(op->Inputs(), op->Outputs(), op->Attrs()); + + if (no_need_buffer_ins_.empty()) return; + + for (auto &in_name_pair : op->Inputs()) { + if (no_need_buffer_ins_.count(in_name_pair.first) != 0) { + continue; + } + + for (auto &in_arg_name : in_name_pair.second) { + other_args_set_.insert(in_arg_name); + } + } + + for (auto &out_name_pair : op->Outputs()) { + for (auto &out_arg_name : out_name_pair.second) { + other_args_set_.insert(out_arg_name); + } + } + } + } + + bool IsBuilt() const { return is_built_; } + + bool IsInArgBufferNeeded(const std::string &in_arg_name) const { + return no_need_buffer_ins_.empty() || + other_args_set_.count(in_arg_name) != 0; + } + + private: + // A set to record unused buffer input vars of op + std::unordered_set no_need_buffer_ins_; + // A set to record other args of op (including in, out) + std::unordered_set other_args_set_; + bool is_built_{false}; +}; + +static bool VarCanBeDeleted(const std::string &name, const BlockDesc &block, + const std::unordered_set &skip_vars) { + if (skip_vars.count(name) != 0) { + return false; + } + + auto *var_desc = block.FindVar(name); + if (var_desc == nullptr || var_desc->Persistable()) { + return false; + } + + auto type = var_desc->Proto()->type().type(); + + return type == proto::VarType::LOD_TENSOR || + type == proto::VarType::SELECTED_ROWS || + type == proto::VarType::LOD_TENSOR_ARRAY; +} + +std::unordered_map> GetUnusedVars( + const BlockDesc &block, + const std::vector> &ops, + const std::vector &skip_var_list) { + std::unordered_set skip_vars(skip_var_list.begin(), + skip_var_list.end()); + + std::unordered_map var_op_idx_map; + + for (size_t i = 0; i < ops.size(); ++i) { + auto *op = ops[i].get(); + + OpInOutInfo info; + for (auto &name_pair : op->Inputs()) { + for (auto &name : name_pair.second) { + if (!VarCanBeDeleted(name, block, skip_vars)) { + continue; + } + + // var can be gc-ed + if (!info.IsBuilt()) { + info.Build(op); + } + + if (info.IsInArgBufferNeeded(name)) { + // Update the last living op of variable to current op + var_op_idx_map[name] = i; + } else { + VLOG(10) << "Skip reference count computing of variable " + << name_pair.first << "(" << name << ") in Operator " + << op->Type(); + } + } + } + + for (auto &name_pair : op->Outputs()) { + for (auto &name : name_pair.second) { + if (VarCanBeDeleted(name, block, skip_vars)) { + // Update the last living op of variable to current op + var_op_idx_map[name] = i; + } + } + } + } + + std::unordered_map> result; + for (auto &name_op_idx_pair : var_op_idx_map) { + auto &name = name_op_idx_pair.first; + size_t op_idx = name_op_idx_pair.second; + result[ops[op_idx].get()].emplace_back(name); + } + return result; +} + +void DeleteUnusedTensors( + const Scope &scope, OperatorBase *op, + const std::unordered_map> + &delete_vars_map, + GarbageCollector *gc) { + auto iter = delete_vars_map.find(op); + if (iter == delete_vars_map.end()) { + return; + } + + auto &delete_vars = iter->second; 
+ + std::deque<std::shared_ptr<memory::Allocation>> garbages; + + for (auto &var_name : delete_vars) { + auto *var = scope.FindVar(var_name); + if (var == nullptr) { + continue; + } + + VLOG(2) << "Erase variable " << var_name; + if (var->IsType<LoDTensor>()) { + garbages.emplace_back(var->GetMutable<LoDTensor>()->MoveMemoryHolder()); + } else if (var->IsType<SelectedRows>()) { + garbages.emplace_back( + var->GetMutable<SelectedRows>()->mutable_value()->MoveMemoryHolder()); + } else if (var->IsType<LoDTensorArray>()) { + auto *lod_tensor_arr = var->GetMutable<LoDTensorArray>(); + for (auto &t : *lod_tensor_arr) { + garbages.emplace_back(t.MoveMemoryHolder()); + } + } else { + PADDLE_THROW("Type %s of %s is not supported for eager deletion", + framework::ToTypeName(var->Type()), var_name); + } + } + + if (!garbages.empty()) { + gc->Add(std::move(garbages)); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/executor_gc_helper.h b/paddle/fluid/framework/executor_gc_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..8553273f8242844d0203d7bcd90ea2090b65826c --- /dev/null +++ b/paddle/fluid/framework/executor_gc_helper.h @@ -0,0 +1,42 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "paddle/fluid/framework/garbage_collector.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { + +// Result map: op -> variable names that can be deleted after op runs +std::unordered_map<OperatorBase *, std::vector<std::string>> GetUnusedVars( + const BlockDesc &block, + const std::vector<std::unique_ptr<OperatorBase>> &ops, + const std::vector<std::string> &skip_vars); + +// Collect unused tensors after op runs +void DeleteUnusedTensors( + const Scope &scope, OperatorBase *op, + const std::unordered_map<OperatorBase *, std::vector<std::string>> + &delete_vars_map, + GarbageCollector *gc); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index 54d9d0dc018b08decb2ff8965659bab98e81f3ab..789b2ef80ec09a69ca227a27c61dd58e58a2fc04 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -13,14 +13,36 @@ // limitations under the License. #include +#include +#include +#include +#include // NOLINT +#include #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cuda_device_guard.h" #endif +#include "gflags/gflags.h" +#include "glog/logging.h" #include "paddle/fluid/framework/garbage_collector.h" namespace paddle { namespace framework { +DEFINE_double( + eager_delete_tensor_gb, -1.0, + "Memory size threshold (GB) when the garbage collector clears tensors. " + "Disabled when this value is less than 0"); + +DEFINE_bool(fast_eager_deletion_mode, true, + "Fast eager deletion mode. If enabled, memory would be released " + "immediately without waiting for GPU kernels to end."); + +DEFINE_double(memory_fraction_of_eager_deletion, 1.0, "Fraction of eager deletion. 
If less than 1.0, all variables in " + "the program would be sorted according to its memory size, and " + "only the FLAGS_memory_fraction_of_eager_deletion of the largest " + "variables would be deleted."); + GarbageCollector::GarbageCollector(const platform::Place &place, size_t max_memory_size) : max_memory_size_((std::max)(max_memory_size, static_cast(1))) { @@ -85,5 +107,25 @@ void StreamGarbageCollector::ClearCallback( callback_manager_->AddCallback(callback); } #endif + +int64_t GetEagerDeletionThreshold() { + return FLAGS_eager_delete_tensor_gb < 0 + ? -1 + : static_cast(FLAGS_eager_delete_tensor_gb * + (static_cast(1) << 30)); +} + +bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; } + +void SetEagerDeletionMode(double threshold, double fraction, bool fast_mode) { + FLAGS_eager_delete_tensor_gb = threshold; + FLAGS_memory_fraction_of_eager_deletion = fraction; + FLAGS_fast_eager_deletion_mode = fast_mode; +} + +double GetEagerDeletionMemoryFraction() { + return FLAGS_memory_fraction_of_eager_deletion; +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index 2768671029c06562aa0d2e5eea3d3ff61d900ab5..f0b504627ae0cd99c8b4b15df3dcfc39a56507f2 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -18,6 +18,8 @@ #include #include #include // NOLINT +#include +#include "gflags/gflags.h" #include "paddle/fluid/platform/device_context.h" namespace paddle { @@ -126,5 +128,12 @@ void GarbageCollector::Add(Container &&objs, Callback &&callback) { } } +int64_t GetEagerDeletionThreshold(); +bool IsFastEagerDeletionModeEnabled(); + +void SetEagerDeletionMode(double threshold, double fraction, bool fast_mode); + +double GetEagerDeletionMemoryFraction(); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/grad_op_desc_maker.h b/paddle/fluid/framework/grad_op_desc_maker.h index 9bccb1a32bf63b30351ef4428594691b0eef0b6a..f2f4c53eea2150b68f15d2a655809d94611b2034 100644 --- a/paddle/fluid/framework/grad_op_desc_maker.h +++ b/paddle/fluid/framework/grad_op_desc_maker.h @@ -14,7 +14,9 @@ limitations under the License. */ #pragma once #include +#include #include +#include #include #include #include "paddle/fluid/framework/op_desc.h" @@ -55,11 +57,11 @@ class GradOpDescMakerBase { std::back_inserter(ret_val), [this](const std::string& fwd_var_name) -> std::string { auto g_name = GradVarName(fwd_var_name); - if (no_grad_set_.count(g_name)) { - return kEmptyVarName; - } else { + if (no_grad_set_.empty() || !no_grad_set_.count(g_name)) { (*this->grad_to_var_)[g_name] = fwd_var_name; return g_name; + } else { + return kEmptyVarName; } }); if (!drop_empty_grad) { diff --git a/paddle/fluid/framework/inplace_op_inference.h b/paddle/fluid/framework/inplace_op_inference.h index a3ccf677c90e8466f6c89041979336d45c1ac942..df46d4f9a805b6e497a6f939e91ecf7dc395e7f0 100644 --- a/paddle/fluid/framework/inplace_op_inference.h +++ b/paddle/fluid/framework/inplace_op_inference.h @@ -17,8 +17,8 @@ #include #include #include +#include #include "glog/logging.h" -#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/details/memory_optimize_helper.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/type_defs.h" @@ -32,55 +32,22 @@ namespace framework { then Out will inplaced use X's memory. The base class will do legality validation for both variables. 
*/ + class InplaceOpInference { public: virtual ~InplaceOpInference() {} virtual std::unordered_map operator()( - const OpDesc& op_desc, BlockDesc* block) const = 0; -}; - -class InplaceInToOut : public InplaceOpInference { - public: - std::unordered_map operator()( - const OpDesc& op_desc, BlockDesc* block) const { - std::unordered_map ret; - auto in_out_var_names_pair = this->Apply(op_desc, block); - for (auto& pair : in_out_var_names_pair) { - PADDLE_ENFORCE(!op_desc.Input(pair.first).empty(), - string::Sprintf("op %s do not have input of %s!", - op_desc.Type(), pair.first)); - PADDLE_ENFORCE(!op_desc.Output(pair.second).empty(), - string::Sprintf("op %s do not have output of %s!", - op_desc.Type(), pair.second)); - auto& in_name = op_desc.Input(pair.first).at(0); - auto& out_name = op_desc.Output(pair.second).at(0); - - auto in = block->FindRecursiveOrCreateVar(in_name); - auto out = block->FindRecursiveOrCreateVar(out_name); - if (TryInplaceInputOutput(in, out)) ret.insert({in_name, out_name}); - } - return ret; - } - - protected: - virtual std::unordered_map Apply( - const OpDesc& op_desc, BlockDesc* block) const = 0; - - bool TryInplaceInputOutput(const VarDesc& in, const VarDesc& out) const { - return in.Name() != out.Name() && details::NodeCanReused(in) && - details::NodeCanReused(out) && - details::NodeSize(out) <= details::NodeSize(in); - } + const OpDesc& op_desc) const = 0; }; /* Inplace In and Out for operator only have an Input and an Output. For example, activation op. */ -class SingleOpInplaceInToOut : public InplaceInToOut { - protected: - std::unordered_map Apply( - const OpDesc& op_desc, BlockDesc* block) const override { +class SingleOpInplaceInToOut : public InplaceOpInference { + public: + std::unordered_map operator()( + const OpDesc& op_desc) const override { PADDLE_ENFORCE(!op_desc.InputNames().empty(), "Op inputs must not be empty"); PADDLE_ENFORCE(!op_desc.OutputNames().empty(), @@ -95,10 +62,10 @@ class SingleOpInplaceInToOut : public InplaceInToOut { Gradient op. Inplace output use it's Input. For example, Input@Grad->Input reuse strategy. 
*/ -class GradOpInplaceInToOut : public InplaceInToOut { - protected: - std::unordered_map Apply( - const OpDesc& op_desc, BlockDesc* block) const override { +class GradOpInplaceInToOut : public InplaceOpInference { + public: + std::unordered_map operator()( + const OpDesc& op_desc) const override { std::unordered_map ret; std::unordered_set output_names(op_desc.OutputNames().begin(), op_desc.OutputNames().end()); diff --git a/paddle/fluid/framework/inplace_op_inference_test.cc b/paddle/fluid/framework/inplace_op_inference_test.cc index bf9d1dcd380cdff886301faf13b0015fd5a2ed5c..c93e562955fb36ddc4363fac862f3942758af35d 100644 --- a/paddle/fluid/framework/inplace_op_inference_test.cc +++ b/paddle/fluid/framework/inplace_op_inference_test.cc @@ -127,26 +127,20 @@ class MultiOutGradShapeInference : public framework::InferShapeBase { } }; -class MultiOutInplaceInToOut : public framework::InplaceInToOut { +class MultiOutInplaceInToOut : public framework::InplaceOpInference { public: - using framework::InplaceInToOut::InplaceInToOut; - - protected: - std::unordered_map Apply( - const OpDesc& op_desc, BlockDesc* block) const override { + std::unordered_map operator()( + const OpDesc& op_desc) const override { return std::unordered_map{ {"X", "Out"}, {"Y", "YOut"}, {"Z", "ZOut"}, }; } }; -class MultiOutGradInplaceInToOut : public framework::InplaceInToOut { +class MultiOutGradInplaceInToOut : public framework::InplaceOpInference { public: - using framework::InplaceInToOut::InplaceInToOut; - - protected: - std::unordered_map Apply( - const OpDesc& op_desc, BlockDesc* block) const override { + std::unordered_map operator()( + const OpDesc& op_desc) const override { return std::unordered_map{ {framework::GradVarName("YOut"), framework::GradVarName("Y")}, {framework::GradVarName("Out"), framework::GradVarName("X")}, @@ -171,118 +165,118 @@ REGISTER_OPERATOR(multi_out_grad, f::NOP, f::MultiOutGradInplaceInToOut, namespace paddle { namespace framework { -TEST(InferInplace, SingleOpInplaceInToOut) { - ProgramDesc prog; - auto* op = prog.MutableBlock(0)->AppendOp(); - op->SetType("single_op"); - op->SetInput("X", {"test2_a", "test2_b", "test2_c"}); - op->SetOutput("Out", {"test2_out"}); - - prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128}); - prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("test2_out"); - prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 128, 128}); - - auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; - auto in_to_outs = infer_inplace(*op, op->Block()); - EXPECT_EQ(in_to_outs.size(), 1ul); - auto it = in_to_outs.begin(); - EXPECT_EQ(it->first, "test2_a"); - EXPECT_EQ(it->second, "test2_out"); -} - -TEST(InferInplace, SingleGradOpInplaceInToOut) { - ProgramDesc prog; - auto* op = prog.MutableBlock(0)->AppendOp(); - op->SetType("single_op_grad"); - op->SetInput(GradVarName("Out"), {"test2_out"}); - op->SetOutput(GradVarName("X"), {"test2_a", "test2_b", "test2_c"}); - - prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16, 1024, 1024}); - prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("test2_out"); - 
prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 1024, 1024}); - - auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; - auto in_to_outs = infer_inplace(*op, op->Block()); - EXPECT_EQ(in_to_outs.size(), 1ul); - auto it = in_to_outs.begin(); - EXPECT_EQ(it->first, "test2_out"); - EXPECT_EQ(it->second, "test2_a"); -} - -TEST(InferInplace, MultiOutInplaceInToOut) { - ProgramDesc prog; - auto* op = prog.MutableBlock(0)->AppendOp(); - op->SetType("multi_out_op"); - op->SetInput("X", {"a0", "a1"}); - op->SetInput("Y", {"b0"}); - op->SetInput("Z", {"c0", "c1"}); - op->SetOutput("Out", {"o0"}); - op->SetOutput("YOut", {"y0"}); - op->SetOutput("ZOut", {"z0"}); - - prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("o0"); - prog.MutableBlock(0)->Var("y0"); - prog.MutableBlock(0)->Var("z0"); - prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024}); - prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024}); - prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024}); - prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024}); - prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024}); - prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024}); - - auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; - auto in_to_outs = infer_inplace(*op, op->Block()); - EXPECT_EQ(in_to_outs.size(), 3ul); - std::unordered_map expects = { - {"a0", "o0"}, {"b0", "y0"}, {"c0", "z0"}, - }; - EXPECT_TRUE(expects == in_to_outs); -} - -TEST(InferInplace, MultiGradInplaceInToOut) { - ProgramDesc prog; - auto* op = prog.MutableBlock(0)->AppendOp(); - op->SetType("multi_out_grad"); - op->SetInput(GradVarName("Out"), {"o0"}); - op->SetInput(GradVarName("YOut"), {"y0"}); - op->SetInput(GradVarName("ZOut"), {"z0"}); - op->SetOutput(GradVarName("X"), {"a0", "a1"}); - op->SetOutput(GradVarName("Y"), {"b0"}); - op->SetOutput(GradVarName("Z"), {"c0", "c1"}); - - prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("o0"); - prog.MutableBlock(0)->Var("y0"); - prog.MutableBlock(0)->Var("z0"); - prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024}); - prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024}); - prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024}); - prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024}); - prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024}); - prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024}); - - auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; - auto in_to_outs = infer_inplace(*op, op->Block()); - - EXPECT_EQ(in_to_outs.size(), 3ul); - std::unordered_map expects = { - {"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"}, - }; - EXPECT_TRUE(expects == in_to_outs); -} +// TEST(InferInplace, SingleOpInplaceInToOut) { +// ProgramDesc prog; +// auto* op = prog.MutableBlock(0)->AppendOp(); +// op->SetType("single_op"); +// op->SetInput("X", {"test2_a", "test2_b", "test2_c"}); +// op->SetOutput("Out", {"test2_out"}); +// 
+// prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR); +// prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128}); +// prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR); +// prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR); +// prog.MutableBlock(0)->Var("test2_out"); +// prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 128, 128}); +// +// auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; +// auto in_to_outs = infer_inplace(*op); +// EXPECT_EQ(in_to_outs.size(), 1ul); +// auto it = in_to_outs.begin(); +// EXPECT_EQ(it->first, "test2_a"); +// EXPECT_EQ(it->second, "test2_out"); +// } +// +// TEST(InferInplace, SingleGradOpInplaceInToOut) { +// ProgramDesc prog; +// auto* op = prog.MutableBlock(0)->AppendOp(); +// op->SetType("single_op_grad"); +// op->SetInput(GradVarName("Out"), {"test2_out"}); +// op->SetOutput(GradVarName("X"), {"test2_a", "test2_b", "test2_c"}); +// +// prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR); +// prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16, 1024, 1024}); +// prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR); +// prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR); +// prog.MutableBlock(0)->Var("test2_out"); +// prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 1024, 1024}); +// +// auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; +// auto in_to_outs = infer_inplace(*op); +// EXPECT_EQ(in_to_outs.size(), 1ul); +// auto it = in_to_outs.begin(); +// EXPECT_EQ(it->first, "test2_out"); +// EXPECT_EQ(it->second, "test2_a"); +// } +// +// TEST(InferInplace, MultiOutInplaceInToOut) { +// ProgramDesc prog; +// auto* op = prog.MutableBlock(0)->AppendOp(); +// op->SetType("multi_out_op"); +// op->SetInput("X", {"a0", "a1"}); +// op->SetInput("Y", {"b0"}); +// op->SetInput("Z", {"c0", "c1"}); +// op->SetOutput("Out", {"o0"}); +// op->SetOutput("YOut", {"y0"}); +// op->SetOutput("ZOut", {"z0"}); +// +// prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR); +// prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR); +// prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR); +// prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR); +// prog.MutableBlock(0)->Var("o0"); +// prog.MutableBlock(0)->Var("y0"); +// prog.MutableBlock(0)->Var("z0"); +// prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024}); +// prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024}); +// prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024}); +// prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024}); +// prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024}); +// prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024}); +// +// auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; +// auto in_to_outs = infer_inplace(*op); +// EXPECT_EQ(in_to_outs.size(), 3ul); +// std::unordered_map expects = { +// {"a0", "o0"}, {"b0", "y0"}, {"c0", "z0"}, +// }; +// EXPECT_TRUE(expects == in_to_outs); +// } +// +// TEST(InferInplace, MultiGradInplaceInToOut) { +// ProgramDesc prog; +// auto* op = prog.MutableBlock(0)->AppendOp(); +// op->SetType("multi_out_grad"); +// op->SetInput(GradVarName("Out"), {"o0"}); +// op->SetInput(GradVarName("YOut"), {"y0"}); +// op->SetInput(GradVarName("ZOut"), {"z0"}); +// op->SetOutput(GradVarName("X"), {"a0", "a1"}); 
+// op->SetOutput(GradVarName("Y"), {"b0"}); +// op->SetOutput(GradVarName("Z"), {"c0", "c1"}); +// +// prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR); +// prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR); +// prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR); +// prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR); +// prog.MutableBlock(0)->Var("o0"); +// prog.MutableBlock(0)->Var("y0"); +// prog.MutableBlock(0)->Var("z0"); +// prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024}); +// prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024}); +// prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024}); +// prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024}); +// prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024}); +// prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024}); +// +// auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; +// auto in_to_outs = infer_inplace(*op); +// +// EXPECT_EQ(in_to_outs.size(), 3ul); +// std::unordered_map expects = { +// {"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"}, +// }; +// EXPECT_TRUE(expects == in_to_outs); +// } } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index ca6b0229e906c0f8bfbf9ee6781013cb4ef7bbce..81b8ffa83f612f5b67cd91a7a2c1228519a1fbb7 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -66,22 +66,33 @@ pass_library(conv_elementwise_add_fuse_pass inference) pass_library(conv_affine_channel_fuse_pass inference) pass_library(transpose_flatten_concat_fuse_pass inference) pass_library(identity_scale_op_clean_pass base) +pass_library(sync_batch_norm_pass base) +pass_library(runtime_context_cache_pass base) +pass_library(simplify_anakin_detection_pattern_pass inference) +pass_library(anakin_fillconstant_elementwisemul_fuse inference) # There may be many transpose-flatten structures in a model, and the output of # these structures will be used as inputs to the concat Op. This pattern will # be detected by our pass. The index here represents the number of structures in the # pattern. We use index 3 ~ 6, because these quantities of structures are # common in the models. 
-foreach (index RANGE 3 6) +foreach (index RANGE 2 6) file(APPEND ${pass_file} "USE_PASS(transpose_flatten${index}_concat_fuse_pass);\n") endforeach() +foreach (index RANGE 2 6) + file(APPEND ${pass_file} "USE_PASS(simplify_anakin_detection_pattern_pass${index});\n") +endforeach() + if(WITH_MKLDNN) pass_library(mkldnn_placement_pass base mkldnn) pass_library(depthwise_conv_mkldnn_pass base mkldnn) pass_library(conv_bias_mkldnn_fuse_pass inference mkldnn) pass_library(conv_relu_mkldnn_fuse_pass inference mkldnn) pass_library(conv_elementwise_add_mkldnn_fuse_pass inference mkldnn) + pass_library(cpu_quantize_placement_pass base mkldnn) + pass_library(cpu_quantize_pass inference mkldnn) + pass_library(cpu_quantize_squash_pass inference mkldnn) endif() cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector ) @@ -100,10 +111,16 @@ cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS g cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto) cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DEPS seqpool_concat_fuse_pass framework_proto) cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass) +if(NOT WIN32) + cc_test(test_sync_batch_norm_pass SRCS sync_batch_norm_pass_tester.cc DEPS sync_batch_norm_pass) +endif() if (WITH_MKLDNN) cc_test(test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass) cc_test(test_conv_bias_mkldnn_fuse_pass SRCS mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc DEPS conv_bias_mkldnn_fuse_pass naive_executor) cc_test(test_conv_relu_mkldnn_fuse_pass SRCS mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass) cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass) cc_test(test_mkldnn_placement_pass SRCS mkldnn/mkldnn_placement_pass_tester.cc DEPS mkldnn_placement_pass) + cc_test(test_cpu_quantize_placement_pass SRCS mkldnn/cpu_quantize_placement_pass_tester.cc DEPS cpu_quantize_placement_pass) + cc_test(test_cpu_quantize_pass SRCS mkldnn/cpu_quantize_pass_tester.cc DEPS cpu_quantize_pass naive_executor) + cc_test(test_cpu_quantize_squash_pass SRCS mkldnn/cpu_quantize_squash_pass_tester.cc DEPS cpu_quantize_squash_pass naive_executor) endif () diff --git a/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.cc b/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.cc new file mode 100644 index 0000000000000000000000000000000000000000..83b0da0c0118a856e54d744607cee8b421f330a3 --- /dev/null +++ b/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.cc @@ -0,0 +1,85 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
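The new anakin_fillconstant_elementwisemul_fuse pass that follows replaces a `fill_constant` feeding an `elementwise_mul` with a single `scale` op (scale = the filled value, bias = 0). The rewrite is sound because multiplying by a constant-filled tensor is exactly an affine scale with zero bias; a small self-contained check of that identity (ordinary C++, not framework code):

```cpp
#include <cassert>
#include <vector>

// y1 = elementwise_mul(x, fill_constant(value=c))
// y2 = scale(x, scale=c, bias=0, bias_after_scale=true) = c * x + 0
// The fusion is valid only because y1 == y2 elementwise.
int main() {
  const float c = 2.5f;  // stands in for the fill_constant "value" attribute
  const std::vector<float> x = {1.0f, -3.0f, 0.5f, 0.0f};

  for (float v : x) {
    float mul_result = v * c;           // the original two-op form
    float scale_result = c * v + 0.0f;  // the fused scale form
    assert(mul_result == scale_result);
  }
  return 0;
}
```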
+ +#include +#include + +#include "paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.h" +#include "paddle/fluid/framework/ir/graph_viz_pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern); +#define GET_NODES \ + GET_IR_NODE(fill_constant); \ + GET_IR_NODE(fill_constant_out); \ + GET_IR_NODE(elementwise_mul); \ + GET_IR_NODE(elementwise_mul_out); + +std::unique_ptr AnakinFillconstantElementwisemulFuse::ApplyImpl( + std::unique_ptr graph) const { + const std::string pattern_name = "anakin_fillconstant_elementwisemul_fuse"; + FusePassBase::Init(pattern_name, graph.get()); + + GraphPatternDetector gpd; + auto* x = gpd.mutable_pattern() + ->NewNode("x") + ->assert_is_op_input("elementwise_mul", "X") + ->AsInput(); + + patterns::AnakinFillConstantElementWiseMulFuse pattern(gpd.mutable_pattern(), + pattern_name); + pattern(x); + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_NODES; + + PADDLE_ENFORCE(subgraph.count(x)); + auto* elementwise_in = subgraph.at(x); + float constant_value = + boost::get(fill_constant->Op()->GetAttr("value")); + + framework::OpDesc new_op_desc; + new_op_desc.SetType("scale"); + new_op_desc.SetInput("X", {elementwise_in->Name()}); + new_op_desc.SetAttr("scale", constant_value); + new_op_desc.SetAttr("bias", static_cast(0.0)); + new_op_desc.SetAttr("bias_after_scale", true); + new_op_desc.SetOutput("Out", {elementwise_mul_out->Name()}); + new_op_desc.Flush(); + + // Create a new node for the fused op. + auto* scale_op = graph->CreateOpNode(&new_op_desc); + + IR_NODE_LINK_TO(elementwise_in, scale_op); // Input + IR_NODE_LINK_TO(scale_op, elementwise_mul_out); // Output + + // Delete the unneeded nodes. + GraphSafeRemoveNodes(graph.get(), + {fill_constant, fill_constant_out, elementwise_mul}); + }; + + gpd(graph.get(), handler); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(anakin_fillconstant_elementwisemul_fuse, + paddle::framework::ir::AnakinFillconstantElementwisemulFuse); diff --git a/paddle/fluid/framework/details/eager_deletion_pass.h b/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.h similarity index 73% rename from paddle/fluid/framework/details/eager_deletion_pass.h rename to paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.h index d7a7a9709d970841060778806451bc21cb2c7571..fa95143d3adae3e3eeb913af09986fb4a401bd73 100644 --- a/paddle/fluid/framework/details/eager_deletion_pass.h +++ b/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.h @@ -13,20 +13,23 @@ // limitations under the License. 
#pragma once - -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/pass.h" +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" namespace paddle { namespace framework { -namespace details { +namespace ir { + +class AnakinFillconstantElementwisemulFuse : public FusePassBase { + public: + virtual ~AnakinFillconstantElementwisemulFuse() {} -class EagerDeletionPass : public ir::Pass { protected: std::unique_ptr ApplyImpl( std::unique_ptr graph) const override; }; -} // namespace details +} // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/fuse_pass_base.h b/paddle/fluid/framework/ir/fuse_pass_base.h index c53b2a6186741d86f14faf1d21fa19aa09cec036..3a1022bbcbd671391fb034bdff7c3cf97952f84d 100644 --- a/paddle/fluid/framework/ir/fuse_pass_base.h +++ b/paddle/fluid/framework/ir/fuse_pass_base.h @@ -14,6 +14,7 @@ #pragma once +#include #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/scope.h" @@ -24,6 +25,10 @@ namespace ir { static const char kParamScopeAttr[] = "__param_scope__"; static const char kFuseStatisAttr[] = "__fuse_statis__"; +// When we use trt or other third_party lib, the parameters are managed by +// the lib, but not the fluid. So we need to record them to avoid duplicate +// allocation. +static const char kRepetitiveParamAttr[] = "__repetitive_param__"; enum FuseOptions { DO_NOT_FUSE, // fusing will not be done diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 5e954fa9c419b249bb8a4be5a78c01da85b017b2..6a9340b870df324f7dea03181bdb2b097e13e705 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include -#include +#include #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/op_proto_maker.h" @@ -152,6 +152,39 @@ void Graph::ResolveHazard( } } +std::shared_ptr Graph::Clone() { + auto cloned_graph = std::make_shared(this->program_); + cloned_graph->ReleaseNodes(); + cloned_graph->num_node_created_ = 0; + std::unordered_map origin_to_cloned; + for (auto *n : this->node_set_) { + ir::Node *cloned_node = nullptr; + if (n->IsCtrlVar()) { + cloned_node = cloned_graph->CreateControlDepVar(); + } else if (!n->var_desc_ && !n->op_desc_) { // empty node + cloned_node = cloned_graph->CreateEmptyNode(n->Name(), n->NodeType()); + } else if (n->IsVar()) { + cloned_node = cloned_graph->CreateVarNode(n->Var()); + } else if (n->IsOp()) { + cloned_node = cloned_graph->CreateOpNode(n->Op()); + } + if (cloned_node) { + origin_to_cloned[n] = cloned_node; + } else { + PADDLE_THROW("The cloned node's type is not supported!"); + } + } + for (auto *n : this->node_set_) { + for (auto it = n->inputs.begin(); it != n->inputs.end(); it++) { + origin_to_cloned[n]->inputs.push_back(origin_to_cloned[*it]); + } + for (auto it = n->outputs.begin(); it != n->outputs.end(); it++) { + origin_to_cloned[n]->outputs.push_back(origin_to_cloned[*it]); + } + } + return cloned_graph; +} + bool IsControlDepVar(const ir::Node &var) { return var.Name().find(ir::Node::kControlDepVarName) != std::string::npos; } diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index cfd974e4bd679fdd06739f4c943bb197865020fb..fff015d4a6f0c631017458ceb039ae3f1deb0e2c 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include +#include #include #include "paddle/fluid/framework/ir/node.h" @@ -199,7 +200,12 @@ class Graph { // WARN: After a series of passes, the current graph can be quite // different from OriginProgram. Caller shouldn't assume much from // the returned OriginProgram. - const ProgramDesc &OriginProgram() const { return program_; } + const ProgramDesc &OriginProgram() const { + LOG(WARNING) << "WARN: After a series of passes, the current graph can be " + "quite different from OriginProgram. So, please avoid " + "using the `OriginProgram()` method!"; + return program_; + } // This method takes ownership of `node`. ir::Node *AddNode(ir::Node *node) { @@ -212,6 +218,10 @@ class Graph { void ResolveHazard( const std::map> &var_nodes); + // Create a new and duplicated graph. + // WARN: The method only clones the graph structure, not its attributes. 
+ std::shared_ptr Clone(); + private: std::map> InitFromProgram( const ProgramDesc &program); diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index 22d4c0a91cc1638264a8c57aa2841ff4e65a1400..28a37f331c100695f0ffec7288db84f4493d68a0 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -130,15 +130,21 @@ std::map> BuildOperationAdjList( if (adj_list.find(n) == adj_list.end()) { adj_list[n] = std::unordered_set(); } + std::vector nodes; for (auto &var : n->inputs) { for (auto &adj_n : var->inputs) { PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation); VLOG(4) << "adj " << adj_n->Name() << reinterpret_cast(adj_n) << " -> " << n->Name() << reinterpret_cast(n) << " via " << var->Name() << reinterpret_cast(var); - adj_list[n].insert(adj_n); + nodes.push_back(adj_n); } } + std::sort(nodes.begin(), nodes.end(), [](ir::Node *node1, ir::Node *node2) { + return node1->id() > node2->id(); + }); + adj_list[n].insert(std::make_move_iterator(nodes.begin()), + std::make_move_iterator(nodes.end())); } return adj_list; } diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index c0c34d186b00814fe6c6fd42beb78133233a1357..555fdc7b7a03ebc99fcc77a26341d291dac2c308 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -90,7 +90,8 @@ void GraphPatternDetector::operator()(Graph *graph, ValidateByNodeRole(&subgraphs); if (subgraphs.empty()) return; - PrettyLogEndl(Style::detail(), "--- detect %d subgraphs", subgraphs.size()); + PrettyLogEndl(Style::detail(), "--- detected %d subgraphs", + subgraphs.size()); int id = 0; for (auto &g : subgraphs) { VLOG(3) << "optimizing #" << id++ << " subgraph"; @@ -598,10 +599,19 @@ bool VarLinksToOp(Node *node, const std::string &op_type) { bool IsNthInput(Node *var, Node *op, const std::string &argument, size_t nth) { PADDLE_ENFORCE(var->IsVar()); PADDLE_ENFORCE(op->IsOp()); - if (op->Op()->Input(argument).size() <= nth) return false; + if (!HasInput(op, argument) || op->Op()->Input(argument).size() <= nth) + return false; return var->Name() == op->Op()->Input(argument)[nth]; } +bool HasInput(Node *op, const std::string &argument) { + PADDLE_ENFORCE(op->IsOp()); + auto const &names = op->Op()->InputNames(); + if (std::find(names.begin(), names.end(), argument) == names.end()) + return false; + return true; +} + bool IsNthOutput(Node *var, Node *op, const std::string &argument, size_t nth) { PADDLE_ENFORCE(var->IsVar()); PADDLE_ENFORCE(op->IsOp()); @@ -1074,9 +1084,60 @@ PDNode *patterns::Conv::operator()() { ->AsOutput() ->assert_is_op_output("conv2d", "Output"); - conv_op->LinksFrom({input_var, filter_var}); - conv_op->LinksTo({output_var}); + conv_op->LinksFrom({input_var, filter_var}).LinksTo({output_var}); + return output_var; +} + +PDNode *patterns::ConvResidual::operator()(bool with_residual_data) { + auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d"); + + if (!with_residual_data) { + conv_op->assert_more([&](Node *x) { + auto node_names = x->Op()->InputNames(); + if (!HasInput(x, "ResidualData") || + x->Op()->Input("ResidualData").size() == 0) + return true; + return false; + }); + } + + auto input_var = pattern->NewNode(conv_input_repr()) + ->AsInput() + ->assert_is_op_input("conv2d", "Input"); + + auto filter_var = pattern->NewNode(conv_filter_repr()) + ->AsInput() + ->assert_is_op_input("conv2d", 
"Filter"); + auto output_var = pattern->NewNode(conv_output_repr()) + ->AsOutput() + ->assert_is_op_output("conv2d", "Output"); + + std::vector links_from{input_var, filter_var}; + + if (with_residual_data) { + auto res_conn_var = pattern->NewNode(conv_residual_data_repr()) + ->AsInput() + ->assert_is_op_input("conv2d", "ResidualData"); + links_from.push_back(res_conn_var); + } + + conv_op->LinksFrom(links_from).LinksTo({output_var}); + return output_var; +} + +PDNode *patterns::Pool::operator()() { + auto pool_op = pattern->NewNode(pool_op_repr())->assert_is_op("pool2d"); + + auto input_var = pattern->NewNode(pool_input_repr()) + ->AsInput() + ->assert_is_op_input("pool2d", "X"); + + auto output_var = pattern->NewNode(pool_output_repr()) + ->AsOutput() + ->assert_is_op_output("pool2d", "Out"); + + pool_op->LinksFrom({input_var}).LinksTo({output_var}); return output_var; } @@ -1301,6 +1362,51 @@ PDNode *patterns::ConvAffineChannel::operator()( return ac_out_var; } +PDNode *patterns::DequantQuantAny::operator()() { + auto *dequant_in = pattern->NewNode(dequant_in_repr()) + ->AsInput() + ->assert_is_op_input("dequantize", "Input"); + + auto *dequant_op = + pattern->NewNode(dequant_op_repr())->assert_is_op("dequantize"); + + auto *dequant_out = pattern->NewNode(dequant_out_repr()) + ->AsOutput() + ->assert_is_op_output("dequantize", "Output"); + + auto *quant_op = pattern->NewNode(quant_op_repr()) + ->assert_is_op("quantize") + ->AsIntermediate(); + + auto *quant_out = pattern->NewNode(quant_out_repr()) + ->AsOutput() + ->assert_is_op_output("quantize"); + + auto *next_op = pattern->NewNode(next_op_repr())->assert_is_op(); + + dequant_op->LinksFrom({dequant_in}).LinksTo({dequant_out}); + quant_op->LinksFrom({dequant_out}).LinksTo({quant_out}); + next_op->LinksFrom({quant_out}); + + return quant_out; +} + +PDNode *patterns::DequantAny::operator()() { + auto *dequant_op = + pattern->NewNode(dequant_op_repr())->assert_is_op("dequantize"); + + auto *dequant_out = pattern->NewNode(dequant_out_repr()) + ->AsOutput() + ->assert_is_op_output("dequantize", "Output"); + + auto *next_op = pattern->NewNode(next_op_repr())->assert_is_op(); + + dequant_op->LinksTo({dequant_out}); + next_op->LinksFrom({dequant_out}); + + return dequant_out; +} + // a -> transpose_op(1) -> transpose_out_a -> flatten_op(1) -> flatten_out_a // b -> transpose_op(2) -> transpose_out_b -> flatten_op(2) -> flatten_out_b // ... 
@@ -1364,6 +1470,171 @@ PDNode *patterns::TransposeFlattenConcat::operator()( return concat_out; } +PDNode *patterns::AnakinDetectionPattern::operator()( + std::vector conv_in, int times) { + // The times represents the repeat times of the + // {prior_box, prior_box_loc_out, flatten, prior_box_var_out, reshape} + const int kNumFields = 7; + const int kPriorBoxLocOffset = 1; + const int kReshape1Offset = 2; + const int kReshape1OutOffset = 3; + const int kPriorBoxVarOffset = 4; + const int kReshape2Offset = 5; + const int kReshape2OutOffset = 6; + + const int kBoxCoderThirdInputOffset = times; + const int kMultiClassSecondInputNmsOffset = times + 1; + + std::vector nodes; + + for (int i = 0; i < times; i++) { + nodes.push_back( + pattern->NewNode(GetNodeName("prior_box" + std::to_string(i))) + ->assert_is_op("density_prior_box")); + nodes.push_back(pattern->NewNode(GetNodeName("box_out" + std::to_string(i))) + ->assert_is_op_output("density_prior_box", "Boxes") + ->assert_is_op_input("reshape2", "X") + ->AsIntermediate()); + nodes.push_back( + pattern->NewNode(GetNodeName("reshape1" + std::to_string(i))) + ->assert_is_op("reshape2")); + + nodes.push_back( + pattern->NewNode(GetNodeName("reshape1_out" + std::to_string(i))) + ->assert_is_op_output("reshape2") + ->assert_is_op_nth_input("concat", "X", i) + ->AsIntermediate()); + + nodes.push_back( + pattern->NewNode(GetNodeName("box_var_out" + std::to_string(i))) + ->assert_is_op_output("density_prior_box", "Variances") + ->assert_is_op_input("reshape2", "X") + ->AsIntermediate()); + nodes.push_back( + pattern->NewNode(GetNodeName("reshape2" + std::to_string(i))) + ->assert_is_op("reshape2")); + + nodes.push_back( + pattern->NewNode(GetNodeName("reshape2_out" + std::to_string(i))) + ->assert_is_op_output("reshape2") + ->assert_is_op_nth_input("concat", "X", i) + ->AsIntermediate()); + } + + auto concat_op1 = pattern->NewNode(GetNodeName("concat1")) + ->assert_is_op("concat") + ->assert_op_has_n_inputs("concat", times); + auto concat_out1 = pattern->NewNode(GetNodeName("concat1_out")) + ->assert_is_op_output("concat") + ->AsIntermediate(); + + auto concat_op2 = pattern->NewNode(GetNodeName("concat2")) + ->assert_is_op("concat") + ->assert_op_has_n_inputs("concat", times); + auto concat_out2 = pattern->NewNode(GetNodeName("concat2_out")) + ->assert_is_op_output("concat") + ->AsIntermediate(); + + auto box_coder_op = pattern->NewNode(GetNodeName("box_coder")) + ->assert_is_op("box_coder") + ->assert_op_has_n_inputs("box_coder", 3); + + auto box_coder_out = pattern->NewNode(GetNodeName("box_coder_out")) + ->assert_is_op_output("box_coder") + ->AsIntermediate(); + + auto transpose_before_nms = + pattern->NewNode(GetNodeName("transpose_before_nms")) + ->assert_is_op("transpose2"); + + auto transpose_before_nms_out = + pattern->NewNode(GetNodeName("transpose_before_nms_out")) + ->assert_is_op_output("transpose2") + ->assert_is_op_input("multiclass_nms", "Scores") + ->AsIntermediate(); + + auto multiclass_nms_op = pattern->NewNode(GetNodeName("multiclass_nms")) + ->assert_is_op("multiclass_nms") + ->assert_op_has_n_inputs("multiclass_nms", 2); + + auto multiclass_nms_out = pattern->NewNode(GetNodeName("multiclass_nms_out")) + ->assert_is_op_output("multiclass_nms") + ->AsOutput(); + + std::vector reshape1_outs; + std::vector reshape2_outs; + + for (int i = 0; i < times; i++) { + conv_in[i]->AsInput(); + // prior_box + nodes[i * kNumFields]->LinksFrom({conv_in[i]}); + // prior_box box out + nodes[i * kNumFields + kPriorBoxLocOffset]->LinksFrom( + 
{nodes[i * kNumFields]}); + // reshape + nodes[i * kNumFields + kReshape1Offset]->LinksFrom( + {nodes[i * kNumFields + kPriorBoxLocOffset]}); + // reshape_out + nodes[i * kNumFields + kReshape1OutOffset]->LinksFrom( + {nodes[i * kNumFields + kReshape1Offset]}); + + nodes[i * kNumFields + kPriorBoxVarOffset]->LinksFrom( + {nodes[i * kNumFields]}); + // reshape + nodes[i * kNumFields + kReshape2Offset]->LinksFrom( + {nodes[i * kNumFields + kPriorBoxVarOffset]}); + // reshape_out + nodes[i * kNumFields + kReshape2OutOffset]->LinksFrom( + {nodes[i * kNumFields + kReshape2Offset]}); + + reshape1_outs.push_back(nodes[i * kNumFields + kReshape1OutOffset]); + reshape2_outs.push_back(nodes[i * kNumFields + kReshape2OutOffset]); + } + + concat_op1->LinksFrom(reshape1_outs); + concat_op2->LinksFrom(reshape2_outs); + concat_out1->LinksFrom({concat_op1}); + concat_out2->LinksFrom({concat_op2}); + + conv_in[kBoxCoderThirdInputOffset]->AsInput(); + conv_in[kMultiClassSecondInputNmsOffset]->AsInput(); + + box_coder_op->LinksFrom( + {concat_out1, concat_out2, conv_in[kBoxCoderThirdInputOffset]}); + box_coder_out->LinksFrom({box_coder_op}); + + transpose_before_nms->LinksFrom({conv_in[kMultiClassSecondInputNmsOffset]}); + transpose_before_nms_out->LinksFrom({transpose_before_nms}); + + multiclass_nms_op->LinksFrom({box_coder_out, transpose_before_nms_out}) + .LinksTo({multiclass_nms_out}); + + return multiclass_nms_out; +} + +PDNode *patterns::AnakinFillConstantElementWiseMulFuse::operator()( + PDNode *elementwise_op_input) { + auto fill_constant = + pattern->NewNode(fill_constant_repr())->assert_is_op("fill_constant"); + + auto fill_constant_out = pattern->NewNode(fill_constant_out_repr()) + ->assert_is_op_output("fill_constant") + ->assert_is_op_input("elementwise_mul", "Y") + ->AsIntermediate(); + + auto elementwise_mul_op = + pattern->NewNode(elementwise_mul_repr())->assert_is_op("elementwise_mul"); + + auto elementwise_mul_out = pattern->NewNode(elementwise_mul_out_repr()) + ->assert_is_op_output("elementwise_mul") + ->AsOutput(); + + fill_constant_out->LinksFrom({fill_constant}); + elementwise_mul_op->LinksFrom({elementwise_op_input, fill_constant_out}); + elementwise_mul_out->LinksFrom({elementwise_mul_op}); + return elementwise_mul_out; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index c8be586f546dc604375401b13a801841efbf08d2..130ddeac4cd1a38516540d175e17d46f877bd909 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -18,8 +18,11 @@ #include #endif +#include #include #include +#include +#include #include #include #include "paddle/fluid/framework/ir/graph.h" @@ -302,6 +305,9 @@ bool VarLinksFromOp(Node* node, const std::string& op_type); // Check whether a var node is a op node's nth input. bool IsNthInput(Node* var, Node* op, const std::string& argument, size_t nth); +// Check whether the op node has input of given name. +bool HasInput(Node* op, const std::string& argument); + // Tell whether a var node is a op node's nth output. 
bool IsNthOutput(Node* var, Node* op, const std::string& argument, size_t nth);
@@ -656,6 +662,35 @@ struct Conv : public PatternBase {
   PATTERN_DECL_NODE(conv_output);
 };
+// Convolution op with residual data
+struct ConvResidual : public PatternBase {
+  ConvResidual(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "conv_residual") {}
+
+  PDNode* operator()(bool with_residual_data);
+
+  PATTERN_DECL_NODE(conv_op);
+  PATTERN_DECL_NODE(conv_input);
+  PATTERN_DECL_NODE(conv_filter);
+  PATTERN_DECL_NODE(conv_residual_data);
+  PATTERN_DECL_NODE(conv_output);
+};
+
+// Pool op
+// Forward pass for pooling.
+// pool_input is the input.
+// pool_output is the result of the operator.
+struct Pool : public PatternBase {
+  Pool(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "pooling") {}
+
+  PDNode* operator()();
+
+  PATTERN_DECL_NODE(pool_op);
+  PATTERN_DECL_NODE(pool_input);
+  PATTERN_DECL_NODE(pool_output);
+};
+
 // ElementwiseAdd used in residual connections.
 // y_var is used and convolution output.
 // The operator is removed, when residual
@@ -766,6 +801,34 @@ struct ConvAffineChannel : public PatternBase {
   PATTERN_DECL_NODE(ac_out);  // Out
 };
+// Dequantize + Quantize + anyOP
+// This pattern is used for squashing the dequantize-quantize pairs.
+struct DequantQuantAny : public PatternBase {
+  DequantQuantAny(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "dequant_quant_any") {}
+  PDNode* operator()();
+
+  PATTERN_DECL_NODE(dequant_in);
+  PATTERN_DECL_NODE(dequant_op);
+  PATTERN_DECL_NODE(dequant_out);
+  PATTERN_DECL_NODE(quant_op);
+  PATTERN_DECL_NODE(quant_out);
+  PATTERN_DECL_NODE(next_op);
+};
+
+// Dequantize + anyOP
+// This pattern is used for counting the number of operators the
+// dequantize op's output is an input to.
+struct DequantAny : public PatternBase {
+  DequantAny(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "dequant_any") {}
+  PDNode* operator()();
+
+  PATTERN_DECL_NODE(dequant_op);
+  PATTERN_DECL_NODE(dequant_out);
+  PATTERN_DECL_NODE(next_op);
+};
+
 struct TransposeFlattenConcat : public PatternBase {
   TransposeFlattenConcat(PDPattern* pattern, const std::string& name_scope)
       : PatternBase(pattern, name_scope, "transpose_flatten_concat") {}
@@ -781,6 +844,36 @@ struct TransposeFlattenConcat : public PatternBase {
   }
 };
+struct AnakinDetectionPattern : public PatternBase {
+  AnakinDetectionPattern(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "anakin_detect_pattern") {}
+
+  PDNode* operator()(std::vector<PDNode*> conv_inputs, int times);
+
+  std::string GetNodeName(const std::string& op_type) {
+    return PDNodeName(name_scope_, repr_, id_, op_type);
+  }
+
+  PDNode* GetPDNode(const std::string& op_type) {
+    return pattern->RetrieveNode(GetNodeName(op_type));
+  }
+};
+
+struct AnakinFillConstantElementWiseMulFuse : public PatternBase {
+  AnakinFillConstantElementWiseMulFuse(PDPattern* pattern,
+                                       const std::string& name_scope)
+      : PatternBase(pattern, name_scope,
+                    "anakin_fillconstant_elementwisemul_fuse") {}
+
+  PDNode* operator()(PDNode* elementwise_op_input);
+
+  // declare operator node's name
+  PATTERN_DECL_NODE(fill_constant);
+  PATTERN_DECL_NODE(fill_constant_out);
+  PATTERN_DECL_NODE(elementwise_mul);
+  PATTERN_DECL_NODE(elementwise_mul_out);
+};
+
 }  // namespace patterns

 // Link two ir::Nodes from each other.
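The DequantQuantAny declaration above is the hook for the squash pass added later in this patch: a dequantize with scale s_d feeding a quantize with scale s_q cancels out when the scales match, and otherwise collapses to one requantize op. A small illustrative helper (hypothetical name, not part of the patch) capturing the arithmetic:

    // Sketch: the effective rescaling a fused requantize performs when a
    // dequantize (Scale = s_d) is followed by a quantize (Scale = s_q).
    // dequantize divides by s_d and quantize multiplies by s_q, so the
    // pair is equivalent to multiplying the int8 data by s_q / s_d.
    float RequantizeFactor(float dequant_scale, float quant_scale) {
      return quant_scale / dequant_scale;  // 1.0f when the scales match
    }

When the factor is 1.0 the pass removes both ops outright; otherwise it emits a requantize op carrying the two scales as Scale_in/Scale_out, as cpu_quantize_squash_pass.cc below shows.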
diff --git a/paddle/fluid/framework/ir/graph_test.cc b/paddle/fluid/framework/ir/graph_test.cc index 7ed2f96eb24239d87965192d73f4ba200ff5dbeb..a95588a57b434763fb0f01e33528ef15fd1aa42b 100644 --- a/paddle/fluid/framework/ir/graph_test.cc +++ b/paddle/fluid/framework/ir/graph_test.cc
@@ -43,20 +43,20 @@ class SumOpMaker : public OpProtoAndCheckerMaker {

 class SumOpVarTypeInference : public VarTypeInference {
  public:
-  void operator()(const OpDesc &op_desc, BlockDesc *block) const override {
-    auto &inputs = op_desc.Input("X");
+  void operator()(InferVarTypeContext *ctx) const override {
+    auto &inputs = ctx->Input("X");
     auto default_var_type = proto::VarType::SELECTED_ROWS;

     bool any_input_is_lod_tensor = std::any_of(
-        inputs.begin(), inputs.end(), [block](const std::string &name) {
-          return block->Var(name)->GetType() == proto::VarType::LOD_TENSOR;
+        inputs.begin(), inputs.end(), [&ctx](const std::string &name) {
+          return ctx->GetType(name) == proto::VarType::LOD_TENSOR;
         });
     if (any_input_is_lod_tensor) {
       default_var_type = proto::VarType::LOD_TENSOR;
     }

-    auto out_var_name = op_desc.Output("Out").front();
-    block->Var(out_var_name)->SetType(default_var_type);
+    auto out_var_name = ctx->Output("Out").front();
+    ctx->SetType(out_var_name, default_var_type);
   }
 };
@@ -71,7 +71,7 @@ class DummyOpMaker : public OpProtoAndCheckerMaker {

 class DummyOpVarTypeInference : public VarTypeInference {
  public:
-  void operator()(const OpDesc &op_desc, BlockDesc *block) const override {}
+  void operator()(framework::InferVarTypeContext *ctx) const override {}
 };
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..b3a8c208916f699dc032496c6d0fa5bf86227903 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
@@ -0,0 +1,239 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h"
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/string/pretty_log.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+namespace {
+
+void UnlinkNodes(ir::Node* a, ir::Node* b) {
+  a->outputs.erase(std::remove(a->outputs.begin(), a->outputs.end(), b),
+                   a->outputs.end());
+  b->inputs.erase(std::remove(b->inputs.begin(), b->inputs.end(), a),
+                  b->inputs.end());
+}
+
+}  // namespace
+
+enum { U8_MAX = 255, S8_MAX = 127 };
+
+using EigenVectorArrayMap = Eigen::Map<Eigen::Array<double, Eigen::Dynamic, 1>>;
+using string::PrettyLogDetail;
+
+void CPUQuantizePass::QuantizeInput(Graph* g, Node* op, Node* input,
+                                    std::string input_name, double scale_to_one,
+                                    bool is_unsigned,
+                                    std::string scale_attr_name) const {
+  unsigned max = is_unsigned ? U8_MAX : S8_MAX;
+  float scale = scale_to_one * max;
+
+  // Create quantize output variable
+  VarDesc quantize_out_desc(patterns::PDNodeName("quantize", "out"));
+  auto* quantize_out_node = g->CreateVarNode(&quantize_out_desc);
+
+  // create a quantize op node
+  OpDesc q_desc;
+  q_desc.SetType("quantize");
+  q_desc.SetInput("Input", std::vector<std::string>({input->Name()}));
+  q_desc.SetOutput("Output",
+                   std::vector<std::string>({quantize_out_node->Name()}));
+  q_desc.SetAttr("Scale", scale);
+  q_desc.SetAttr("is_negative_input", !is_unsigned);
+  auto quantize_op = g->CreateOpNode(&q_desc);  // OpDesc will be copied.
+
+  // update op's input
+  op->Op()->SetInput(input_name,
+                     std::vector<std::string>({quantize_out_node->Name()}));
+
+  // link quantize op
+  UnlinkNodes(input, op);
+  IR_NODE_LINK_TO(input, quantize_op);
+  IR_NODE_LINK_TO(quantize_op, quantize_out_node);
+  IR_NODE_LINK_TO(quantize_out_node, op);
+
+  if (!scale_attr_name.empty()) op->Op()->SetAttr(scale_attr_name, scale);
+}
+
+void CPUQuantizePass::DequantizeOutput(Graph* g, Node* op, Node* output,
+                                       std::string output_name,
+                                       double scale_to_one, bool is_unsigned,
+                                       std::string scale_attr_name) const {
+  unsigned max = is_unsigned ? U8_MAX : S8_MAX;
+  float scale = scale_to_one * max;
+
+  // Create dequantize input variable
+  VarDesc dequantize_in_desc(patterns::PDNodeName("dequantize", "in"));
+  auto* dequantize_in_node = g->CreateVarNode(&dequantize_in_desc);
+
+  // create a dequantize op node for output.
+  OpDesc deq_desc;
+  deq_desc.SetType("dequantize");
+  deq_desc.SetInput("Input",
+                    std::vector<std::string>({dequantize_in_node->Name()}));
+  deq_desc.SetOutput("Output", std::vector<std::string>({output->Name()}));
+  deq_desc.SetAttr("Scale", scale);
+  auto dequantize_op = g->CreateOpNode(&deq_desc);  // OpDesc will be copied.
+
+  // update op's output
+  op->Op()->SetOutput(output_name,
+                      std::vector<std::string>({dequantize_in_node->Name()}));
+
+  // link dequantize op
+  UnlinkNodes(op, output);
+  IR_NODE_LINK_TO(op, dequantize_in_node);
+  IR_NODE_LINK_TO(dequantize_in_node, dequantize_op);
+  IR_NODE_LINK_TO(dequantize_op, output);
+
+  if (!scale_attr_name.empty()) op->Op()->SetAttr(scale_attr_name, scale);
+}
+
+void CPUQuantizePass::QuantizeConv(Graph* graph,
+                                   bool with_residual_data) const {
+  GraphPatternDetector gpd;
+  auto pattern = gpd.mutable_pattern();
+  patterns::ConvResidual conv_pattern{pattern, name_scope_};
+  conv_pattern(with_residual_data);
+
+  int quantize_conv_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    VLOG(4) << "Quantize conv2d op";
+    GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern);
+    auto* conv_op_desc = conv_op->Op();
+
+    // skip if should not be quantized
+    if (!conv_op_desc->HasAttr("use_quantizer") ||
+        !boost::get<bool>(conv_op_desc->GetAttr("use_quantizer")))
+      return;
+
+    GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern);
+
+    // get scales calculated after warmup, they scale variables to MAX=1.0
+    auto scales = Get<VarQuantScale>("quant_var_scales");
+
+    auto input_scale = scales[conv_input->Name()].second.data<double>()[0];
+    bool is_input_unsigned = scales[conv_input->Name()].first;
+    QuantizeInput(g, conv_op, conv_input, "Input", input_scale,
+                  is_input_unsigned, "Scale_in");
+
+    auto filter_scale_tensor = scales[conv_filter->Name()].second;
+    EigenVectorArrayMap eigen_tensor{filter_scale_tensor.data<double>(),
+                                     filter_scale_tensor.numel(), 1};
+    eigen_tensor *= static_cast<double>(S8_MAX);
+    std::vector<float> filter_scale{
+        filter_scale_tensor.data<double>(),
+        filter_scale_tensor.data<double>() + filter_scale_tensor.numel()};
+
+    conv_op->Op()->SetAttr("Scale_weights", filter_scale);
+
+    if (with_residual_data) {
+      GET_IR_NODE_FROM_SUBGRAPH(conv_residual_data, conv_residual_data,
+                                conv_pattern);
+      auto residual_scale =
+          scales[conv_residual_data->Name()].second.data<double>()[0];
+      bool is_residual_unsigned = scales[conv_residual_data->Name()].first;
+
+      QuantizeInput(g, conv_op, conv_residual_data, "ResidualData",
+                    residual_scale, is_residual_unsigned, "Scale_in_eltwise");
+    }
+
+    auto output_scale = scales[conv_output->Name()].second.data<double>()[0];
+    bool is_output_unsigned = scales[conv_output->Name()].first;
+    DequantizeOutput(g, conv_op, conv_output, "Output", output_scale,
+                     is_output_unsigned, "Scale_out");
+
+    ++quantize_conv_count;
+  };
+
+  gpd(graph, handler);
+  AddStatis(quantize_conv_count);
+
+  std::stringstream msg_ss;
+  msg_ss << "---    quantized " << quantize_conv_count << " conv2d ops";
+  if (with_residual_data) msg_ss << " with residual connection";
+  PrettyLogDetail(msg_ss.str().c_str());
+}
+
+void CPUQuantizePass::QuantizePool(Graph* graph) const {
+  GraphPatternDetector gpd;
+  auto pattern = gpd.mutable_pattern();
+  patterns::Pool pool_pattern{pattern, name_scope_};
+  pool_pattern();
+
+  int quantize_pool_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    VLOG(4) << "Quantize pool2d op";
+    GET_IR_NODE_FROM_SUBGRAPH(pool_op, pool_op, pool_pattern);
+    auto* pool_op_desc = pool_op->Op();
+
+    // skip if should not be quantized
+    if (!pool_op_desc->HasAttr("use_quantizer") ||
+        !boost::get<bool>(pool_op_desc->GetAttr("use_quantizer")))
+      return;
+
+    GET_IR_NODE_FROM_SUBGRAPH(pool_input, pool_input, pool_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(pool_output, pool_output, pool_pattern);
+
+    // get scales calculated after warmup, they scale variables to MAX=1.0
+    auto scales = Get<VarQuantScale>("quant_var_scales");
+
+    auto input_scale = scales[pool_input->Name()].second.data<double>()[0];
+    bool is_input_unsigned = scales[pool_input->Name()].first;
+    QuantizeInput(g, pool_op, pool_input, "X", input_scale, is_input_unsigned);
+
+    auto output_scale = scales[pool_output->Name()].second.data<double>()[0];
+    bool is_output_unsigned = scales[pool_output->Name()].first;
+    DequantizeOutput(g, pool_op, pool_output, "Out", output_scale,
+                     is_output_unsigned);
+
+    ++quantize_pool_count;
+  };
+
+  gpd(graph, handler);
+  AddStatis(quantize_pool_count);
+
+  PrettyLogDetail("---    quantized %d pool2d ops", quantize_pool_count);
+}
+
+std::unique_ptr<ir::Graph> CPUQuantizePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  VLOG(3) << "Quantizing the graph.";
+  PADDLE_ENFORCE(graph.get());
+  FusePassBase::Init(name_scope_, graph.get());
+
+  PADDLE_ENFORCE(param_scope());
+
+  QuantizeConv(graph.get(), false /* with_residual_data */);
+  QuantizeConv(graph.get(), true /* with_residual_data */);
+  QuantizePool(graph.get());
+
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(cpu_quantize_pass, paddle::framework::ir::CPUQuantizePass)
+    .RequirePassAttr("quant_var_scales");
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..9873bb04e138a745ac6aa44cf5791651ad897444 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h
@@ -0,0 +1,66 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+/*
+ * Map variable name to tensor of scaling factors scaling it to MAX=1.0.
+ * bool denotes whether quantization of the variable should be done to unsigned
+ * type.
+ */
+using VarQuantScale =
+    std::unordered_map<std::string, std::pair<bool, LoDTensor>>;
+
+/*
+ * Quantize all supported operators.
+ */
+class CPUQuantizePass : public FusePassBase {
+ public:
+  virtual ~CPUQuantizePass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+
+  void QuantizeConv(Graph* graph, bool with_residual_data = false) const;
+
+  void QuantizePool(Graph* graph) const;
+
+  void QuantizeInput(Graph* g, Node* op, Node* input, std::string input_name,
+                     double scale_to_one, bool is_unsigned,
+                     std::string scale_attr_name = "") const;
+
+  void DequantizeOutput(Graph* g, Node* op, Node* output,
+                        std::string output_name, double scale_to_one,
+                        bool is_unsigned,
+                        std::string scale_attr_name = "") const;
+
+  const std::string name_scope_{"quantize"};
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..0d0ed989012fced7f639c2bc12a3bafa6edf27f6 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
@@ -0,0 +1,211 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
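A quick worked example of the scale arithmetic described above, which the tests below also rely on: QuantizeInput turns a "scale to MAX=1.0" factor into the op attribute by multiplying with the integer range. A hypothetical standalone helper for illustration:

    // Sketch of the scale computation in CPUQuantizePass::QuantizeInput.
    float AttrScale(double scale_to_one, bool is_unsigned) {
      const unsigned max = is_unsigned ? 255 /* U8_MAX */ : 127 /* S8_MAX */;
      return static_cast<float>(scale_to_one * max);
    }
    // e.g. the tester stores 2.0 per variable and uses signed int8, so
    // AttrScale(2.0, false) == 254.0f, the value asserted against Scale_in.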
+ +#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h" +#include +#include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace framework { +namespace ir { + +void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, + const std::vector& inputs, + const std::vector& outputs, bool use_mkldnn, + bool use_quantizer = false) { + auto* op = prog->MutableBlock(0)->AppendOp(); + op->SetType(type); + op->SetAttr("use_mkldnn", use_mkldnn); + op->SetAttr("name", name); + if (type == "conv2d") { + op->SetInput("Input", {inputs[0]}); + op->SetInput("Filter", {inputs[1]}); + if (inputs.size() > 2) + op->SetInput("Bias", {inputs[2]}); + else + op->SetInput("Bias", {}); + if (inputs.size() > 3) { + op->SetInput("ResidualData", {inputs[3]}); + op->SetAttr("fuse_residual_connection", true); + } else { + op->SetInput("ResidualData", {}); + op->SetAttr("fuse_residual_connection", false); + } + op->SetOutput("Output", {outputs[0]}); + op->SetAttr("use_quantizer", use_quantizer); + op->SetAttr("Scale_in", 1.0f); + op->SetAttr("Scale_out", 1.0f); + op->SetAttr("Scale_weights", std::vector{1.0f}); + } else if (type == "pool2d") { + op->SetInput("X", {inputs[0]}); + op->SetOutput("Out", {outputs[0]}); + op->SetAttr("use_quantizer", use_quantizer); + } else if (type == "dropout") { + op->SetInput("X", {inputs[0]}); + op->SetOutput("Out", {outputs[0]}); + } else if (type == "fc") { + op->SetInput("Input", {inputs[0]}); + if (inputs.size() > 1) op->SetInput("W", {inputs[1]}); + if (inputs.size() > 2) op->SetInput("Bias", {inputs[2]}); + op->SetOutput("Out", {outputs[0]}); + } +} + +static const std::initializer_list variable_names{ + "a", "w1", "c", "d", "w2", "e", "f", "g", + "h", "w3", "b1", "i", "j", "w4", "b2"}; +// (a,w1)->Conv1->c and c->Pool1->d +// +// (d,w2)->Conv2->e and e->Pool2->f +// +// d->Dropout1->g and g->Fc1->h and (h,w3,b1,i)->Conv3->j +// +// (d,w4, b2)->Conv4->i +ProgramDesc BuildProgramDesc(bool use_mkldnn, bool use_quantizer) { + ProgramDesc prog; + for (auto& v : variable_names) { + auto* var = prog.MutableBlock(0)->Var(v); + if (v.find("w") == 0 || v.find("b") == 0) { + var->SetPersistable(true); + } + } + + SetOp(&prog, "conv2d", "Conv1", {"a", "w1"}, {"c"}, use_mkldnn, + use_quantizer); + SetOp(&prog, "pool2d", "Pool1", {"c"}, {"d"}, use_mkldnn, use_quantizer); + + SetOp(&prog, "conv2d", "Conv2", {"d", "w2"}, {"e"}, use_mkldnn, + use_quantizer); + SetOp(&prog, "pool2d", "Pool2", {"e"}, {"f"}, use_mkldnn, use_quantizer); + + SetOp(&prog, "dropout", "Dropout1", {"d"}, {"g"}, use_mkldnn); + SetOp(&prog, "fc", "Fc1", {"g"}, {"h"}, use_mkldnn); + SetOp(&prog, "conv2d", "Conv3", {"h", "w3", "b1", "i"}, {"j"}, use_mkldnn, + use_quantizer); + + SetOp(&prog, "conv2d", "Conv4", {"c", "w4", "b2"}, {"i"}, use_mkldnn, + use_quantizer); + + return prog; +} + +void InitTensorHolder(Scope* scope, const paddle::platform::Place& place, + const char* var_name) { + auto x = scope->Var(var_name); + auto tensor = x->GetMutable(); + tensor->mutable_data(place, proto::VarType::FP32, + ::paddle::memory::Allocator::kDefault, 1); +} + +void MainTest(const ProgramDesc& prog, int conv_count, int pool_count, + int quant_count, int dequant_count, int added_nodes_count, + float scale) { + std::unique_ptr graph(new ir::Graph(prog)); + + // Init scope, as it is used in pass + auto place = paddle::platform::CPUPlace(); + NaiveExecutor exe{place}; + Scope scope; + exe.CreateVariables(prog, 0, true, &scope); + + auto* scales = 
new VarQuantScale(); + + for (auto& v : variable_names) { + InitTensorHolder(&scope, place, v.c_str()); + LoDTensor tensor; + tensor.Resize({1}); + auto* ptr = tensor.mutable_data(place); + ptr[0] = 2.0; + + (*scales)[v] = std::make_pair(false, std::move(tensor)); + } + + graph->Set(kParamScopeAttr, new framework::Scope*(&scope)); + + auto pass = PassRegistry::Instance().Get("cpu_quantize_pass"); + pass->Set("quant_var_scales", scales); + + int original_nodes_num = graph->Nodes().size(); + + graph = pass->Apply(std::move(graph)); + + int current_nodes_num = graph->Nodes().size(); + + int quantize_nodes_count = 0; + int dequantize_nodes_count = 0; + int conv2d_nodes_count = 0; + int pool2d_nodes_count = 0; + for (auto* node : graph->Nodes()) { + if (node->IsOp()) { + auto* op = node->Op(); + if (op->Type() == "conv2d") { + conv2d_nodes_count++; + auto op_name = boost::get(op->GetAttr("name")); + EXPECT_EQ(boost::get(op->GetAttr("Scale_in")), scale) + << "Scale_in for node '" + op_name + "'."; + EXPECT_EQ(boost::get(op->GetAttr("Scale_out")), scale) + << "Scale_out for node '" + op_name + "'."; + EXPECT_EQ( + boost::get>(op->GetAttr("Scale_weights"))[0], + scale) + << "Scale_weights for node '" + op_name + "'."; + } else if (op->Type() == "pool2d") { + pool2d_nodes_count++; + } else if (op->Type() == "quantize") { + quantize_nodes_count++; + } else if (op->Type() == "dequantize") { + dequantize_nodes_count++; + } + } + } + EXPECT_EQ(conv2d_nodes_count, conv_count); + EXPECT_EQ(pool2d_nodes_count, pool_count); + EXPECT_EQ(quantize_nodes_count, quant_count); + EXPECT_EQ(dequantize_nodes_count, dequant_count); + EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num); +} + +TEST(CpuQuantizePass, quantize) { + bool use_mkldnn = true; + bool use_quantizer = true; + // (a->QUANT1->IN1,w1)->Conv1->OUT1->DEQUANT1->c and + // c->QUANT2->IN2->Pool1->OUT2->DEQUANT2->d + // + // (d->QUANT3->IN3,w2)->Conv2->OUT3->DEQUANT3->e and + // e->QUANT4->IN4->Pool2->OUT4->DEQUANT4->f + // + // d->Dropout1->g and g->Fc1->h and + // (h->QUANT5->IN5,w3,b1,i->QUANT6->IN6)->Conv3->OUT5->DEQUANT5->j + // + // (d->QUANT7->IN7,w4, b2)->Conv4->DEQUANT6->OUT6->i + // Insert nodes: 7 Quant + 7 IN + 6 OUT + 6 DEQUANT + int added_nodes = 7 + 7 + 6 + 6; + MainTest(BuildProgramDesc(use_mkldnn, use_quantizer), 4, 2, 7, 6, added_nodes, + 2.0f * 127); +} + +TEST(CpuQuantizePass, do_not_quantize) { + bool use_mkldnn = true; + bool use_quantizer = false; + int added_nodes = 0; + MainTest(BuildProgramDesc(use_mkldnn, use_quantizer), 4, 2, 0, 0, added_nodes, + 1.0f); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(cpu_quantize_pass); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..511003dce59f91272802766544577e9c473a3a1d --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc @@ -0,0 +1,58 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h" +#include +#include + +namespace paddle { +namespace framework { +namespace ir { + +std::unique_ptr CPUQuantizePlacementPass::ApplyImpl( + std::unique_ptr graph) const { + VLOG(3) << "Marks operators which are to be quantized."; + const auto& excluded_ids_list = + Get>("quantize_excluded_op_ids"); + const auto& op_types_list = + Get>("quantize_enabled_op_types"); + for (const Node* n : graph->Nodes()) { + if (n->IsOp()) { + if (std::find(excluded_ids_list.begin(), excluded_ids_list.end(), + n->id()) != excluded_ids_list.end()) + continue; + auto* op = n->Op(); + if (op->HasAttr("use_quantizer") || op->HasProtoAttr("use_quantizer")) { + if (op_types_list.empty()) { + op->SetAttr("use_quantizer", true); + } else if (std::find(op_types_list.begin(), op_types_list.end(), + n->Name()) != op_types_list.end()) { + op->SetAttr("use_quantizer", true); + } + } + } + } + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(cpu_quantize_placement_pass, + paddle::framework::ir::CPUQuantizePlacementPass) + // a vector of operator type names to be quantized ("conv2d" etc.) + .RequirePassAttr("quantize_enabled_op_types") + // a vector of operator ids that are to be excluded from quantization + .RequirePassAttr("quantize_excluded_op_ids"); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..ef3861b2493b5b82620f8bec6e808c0c921a9680 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h @@ -0,0 +1,34 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { +/* + * Specifies which operators should be quantized. + */ +class CPUQuantizePlacementPass : public Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..11d72a56bd66792ff3ed5cc8184f5b242d9cdba5 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc @@ -0,0 +1,129 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h" + +#include +#include + +namespace paddle { +namespace framework { +namespace ir { + +void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, + const std::vector& inputs, + const std::vector& outputs, + boost::tribool use_quantizer) { + auto* op = prog->MutableBlock(0)->AppendOp(); + + op->SetType(type); + + if (!boost::indeterminate(use_quantizer)) + op->SetAttr("use_quantizer", use_quantizer); + + if (type == "conv2d") { + op->SetAttr("name", name); + op->SetInput("Input", {inputs[0]}); + op->SetInput("Filter", {inputs[1]}); + op->SetInput("Bias", {inputs[2]}); + } else if (type == "relu") { + op->SetInput("X", inputs); + } else if (type == "concat") { + op->SetAttr("axis", 1); + op->SetInput("X", {inputs[0], inputs[1]}); + } else if (type == "pool2d") { + op->SetInput("X", {inputs[0]}); + } else { + FAIL() << "Unexpected operator type."; + } + op->SetOutput("Out", {outputs[0]}); +} + +// operator use_quantizer +// --------------------------------------- +// (a,b)->concat->c none +// (c,weights,bias)->conv->f false +// f->relu->g none +// g->pool->h false +// (h,weights2,bias2)->conv->k false +// k->pool->l false +ProgramDesc BuildProgramDesc() { + ProgramDesc prog; + + for (auto& v : + std::vector({"a", "b", "c", "weights", "bias", "f", "g", + "h", "weights2", "bias2", "k", "l"})) { + auto* var = prog.MutableBlock(0)->Var(v); + var->SetType(proto::VarType::SELECTED_ROWS); + if (v == "weights" || v == "bias") { + var->SetPersistable(true); + } + } + + SetOp(&prog, "concat", "concat1", {"a", "b"}, {"c"}, boost::indeterminate); + SetOp(&prog, "conv2d", "conv1", {"c", "weights", "bias"}, {"f"}, false); + SetOp(&prog, "relu", "relu1", {"f"}, {"g"}, boost::indeterminate); + SetOp(&prog, "pool2d", "pool1", {"g"}, {"h"}, false); + SetOp(&prog, "conv2d", "conv2", {"h", "weights2", "bias2"}, {"k"}, false); + SetOp(&prog, "pool2d", "pool2", {"k"}, {"l"}, false); + + return prog; +} + +void MainTest(std::initializer_list quantize_enabled_op_types, + std::initializer_list quantize_excluded_op_ids, + unsigned expected_use_quantizer_true_count) { + auto prog = BuildProgramDesc(); + + std::unique_ptr graph(new ir::Graph(prog)); + + auto pass = PassRegistry::Instance().Get("cpu_quantize_placement_pass"); + pass->Set("quantize_enabled_op_types", + new std::unordered_set(quantize_enabled_op_types)); + pass->Set("quantize_excluded_op_ids", + new std::unordered_set(quantize_excluded_op_ids)); + + graph = pass->Apply(std::move(graph)); + + unsigned use_quantizer_true_count = 0; + + for (auto* node : graph->Nodes()) { + if (node->IsOp()) { + auto* op = node->Op(); + if (op->HasAttr("use_quantizer") && + boost::get(op->GetAttr("use_quantizer"))) { + ++use_quantizer_true_count; + } + } + } + + EXPECT_EQ(use_quantizer_true_count, expected_use_quantizer_true_count); +} + +TEST(QuantizerPlacementPass, enabled_pool) { MainTest({"pool2d"}, {}, 2); } + +TEST(QuantizerPlacementPass, enabled_conv_excluded_one) { + MainTest({"conv2d"}, {4}, 1); +} + +TEST(QuantizerPlacementPass, excluded_none) { + // 2 
conv + 2 pool
+  MainTest({}, {}, 4);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+USE_PASS(cpu_quantize_placement_pass);
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..6e74cc7787b73d06b1093ed4e846ab83b1234803 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc
@@ -0,0 +1,146 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h"
+#include <string>
+#include <vector>
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/string/pretty_log.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+using string::PrettyLogDetail;
+
+void CPUQuantizeSquashPass::FindNodesToKeep(
+    Graph* graph,
+    std::unordered_map<const Node*, int>* nodes_keep_counter) const {
+  GraphPatternDetector gpd;
+  patterns::DequantAny deq_any_pattern{gpd.mutable_pattern(), "dequant_any"};
+  deq_any_pattern();
+
+  int found_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    GET_IR_NODE_FROM_SUBGRAPH(dequant_out, dequant_out, deq_any_pattern);
+
+    if (nodes_keep_counter->find(dequant_out) == nodes_keep_counter->end())
+      (*nodes_keep_counter)[dequant_out] = 1;
+    else
+      (*nodes_keep_counter)[dequant_out] += 1;
+
+    found_count++;
+  };
+  gpd(graph, handler);
+  AddStatis(found_count);
+}
+
+void CPUQuantizeSquashPass::Squash(
+    Graph* graph,
+    std::unordered_map<const Node*, int>* nodes_keep_counter) const {
+  GraphPatternDetector gpd;
+  patterns::DequantQuantAny squash_pattern{gpd.mutable_pattern(), "squash"};
+  squash_pattern();
+
+  int found_squash_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    VLOG(4) << "squash dequantize-quantize ops pair";
+
+    GET_IR_NODE_FROM_SUBGRAPH(dequant_in, dequant_in, squash_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(dequant_op, dequant_op, squash_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(dequant_out, dequant_out, squash_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(quant_op, quant_op, squash_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(quant_out, quant_out, squash_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(next_op, next_op, squash_pattern);
+
+    auto* next_op_desc = next_op->Op();
+    float dequant_scale =
+        boost::get<float>(dequant_op->Op()->GetAttr("Scale"));
+    float quant_scale = boost::get<float>(quant_op->Op()->GetAttr("Scale"));
+    PADDLE_ENFORCE(nodes_keep_counter->find(dequant_out) !=
+                   nodes_keep_counter->end());
+
+    // check if dequantize op should be kept or removed, decrease the counter
+    bool keep_dequant = (*nodes_keep_counter)[dequant_out]-- > 1;
+
+    if (dequant_scale == quant_scale) {
+      // squash dequantize-quantize to nothing
+      auto quant_out_var_name = quant_out->Name();
+      auto next_op_inputs = next_op_desc->InputNames();
+      for (const auto& name : next_op_inputs) {
+        auto var_name = 
next_op_desc->Input(name)[0]; + if (var_name.compare(quant_out_var_name) == 0) { + next_op_desc->SetInput( + name, std::vector({dequant_in->Name()})); + break; + } + } + + if (keep_dequant) + GraphSafeRemoveNodes(graph, {quant_op, quant_out}); + else + GraphSafeRemoveNodes(graph, + {dequant_op, quant_op, dequant_out, quant_out}); + + IR_NODE_LINK_TO(dequant_in, next_op); + + found_squash_count++; + } else { + // squash dequantize-quantize to requantize op + OpDesc desc; + desc.SetType("requantize"); + desc.SetInput("Input", std::vector({dequant_in->Name()})); + desc.SetOutput("Output", std::vector({quant_out->Name()})); + desc.SetAttr("Scale_in", dequant_scale); + desc.SetAttr("Scale_out", quant_scale); + + auto requant_op = g->CreateOpNode(&desc); + + if (keep_dequant) + GraphSafeRemoveNodes(graph, {quant_op}); + else + GraphSafeRemoveNodes(graph, {dequant_op, quant_op, dequant_out}); + + IR_NODE_LINK_TO(dequant_in, requant_op); + IR_NODE_LINK_TO(requant_op, quant_out); + + found_squash_count++; + } + }; + gpd(graph, handler); + AddStatis(found_squash_count); + PrettyLogDetail("--- squashed %d dequantize-quantize pairs", + found_squash_count); +} + +std::unique_ptr CPUQuantizeSquashPass::ApplyImpl( + std::unique_ptr graph) const { + PADDLE_ENFORCE(graph.get()); + FusePassBase::Init("cpu_quantize_squash_pass", graph.get()); + + std::unordered_map nodes_keep_counter; + FindNodesToKeep(graph.get(), &nodes_keep_counter); + Squash(graph.get(), &nodes_keep_counter); + + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(cpu_quantize_squash_pass, + paddle::framework::ir::CPUQuantizeSquashPass); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..b823a2cef35b2f9994df9c9473246db3d69843e7 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h @@ -0,0 +1,58 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Squash dequantize->quantize pair pattern into requantize op + */ +class CPUQuantizeSquashPass : public FusePassBase { + public: + virtual ~CPUQuantizeSquashPass() {} + + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; + + /* + * For each dequantize's output find the number of operators it is an input to + */ + void FindNodesToKeep( + Graph* graph, + std::unordered_map* nodes_keep_counter) const; + + /* + * Squash dequantize-quantize ops pairs into requantize or nothing + */ + void Squash(Graph* graph, + std::unordered_map* nodes_keep_counter) const; + + const std::string name_scope_{"squash"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..3cf51d97aa4b8be468b8c2a78dd17aafbbf0e15b --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc @@ -0,0 +1,179 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h" +#include +#include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace framework { +namespace ir { + +void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, + const std::vector& inputs, + const std::vector& outputs, bool use_mkldnn, + float scale = 0) { + auto* op = prog->MutableBlock(0)->AppendOp(); + op->SetType(type); + op->SetAttr("use_mkldnn", use_mkldnn); + op->SetAttr("name", name); + if (type == "conv2d") { + op->SetInput("Input", {inputs[0]}); + if (inputs.size() > 1) op->SetInput("Filter", {inputs[1]}); + if (inputs.size() > 2) op->SetInput("Bias", {inputs[2]}); + op->SetOutput("Output", {outputs[0]}); + } else if (type == "quantize") { + op->SetInput("Input", {inputs[0]}); + op->SetOutput("Output", {outputs[0]}); + op->SetAttr("Scale", scale); + } else if (type == "dequantize") { + op->SetInput("Input", {inputs[0]}); + op->SetOutput("Output", {outputs[0]}); + op->SetAttr("Scale", scale); + } +} + +// (a,w1,b1)->Conv1->d +// d->Dequant->e +// e->Quant->f +// (f,w2,b2)->Conv2->i +ProgramDesc BuildProgramDesc(bool use_mkldnn, float scale1, float scale2) { + ProgramDesc prog; + for (auto& v : std::initializer_list( + {"a", "w1", "b1", "d", "e", "f", "w2", "b2", "i"})) { + auto* var = prog.MutableBlock(0)->Var(v); + if (v.find("w") == 0 || v.find("b") == 0) { + var->SetPersistable(true); + } + } + + SetOp(&prog, "conv2d", "Conv1", {"a", "w1", "b1"}, {"d"}, use_mkldnn); + SetOp(&prog, "dequantize", "Dequant", {"d"}, {"e"}, use_mkldnn, scale1); + SetOp(&prog, "quantize", "Quant", {"e"}, {"f"}, use_mkldnn, scale2); + SetOp(&prog, "conv2d", "Conv2", {"f", "w2", "b2"}, {"i"}, use_mkldnn); + return prog; +} + +static const std::initializer_list variable_names{ + "a", "b", "c", "d", "e", "f", "g", "h"}; +// a->Conv1->b +// b->Dequant->c +// +// c->Quant1->d and d->Conv2->e +// +// c->Conv3->f +// +// c->Quant2->g and g->Conv4->h +// +ProgramDesc BuildProgramDesc2(bool use_mkldnn, float scale1, float scale2, + float scale3) { + ProgramDesc prog; + for (auto& v : variable_names) { + prog.MutableBlock(0)->Var(v); + } + + SetOp(&prog, "conv2d", "Conv1", {"a"}, {"b"}, use_mkldnn); + SetOp(&prog, "dequantize", "Dequant", {"b"}, {"c"}, use_mkldnn, scale1); + + SetOp(&prog, "quantize", "Quant1", {"c"}, {"d"}, use_mkldnn, scale2); + SetOp(&prog, "conv2d", "Conv2", {"d"}, {"e"}, use_mkldnn); + + SetOp(&prog, "conv2d", "Conv3", {"c"}, {"f"}, use_mkldnn); + + SetOp(&prog, "quantize", "Quant2", {"c"}, {"g"}, use_mkldnn, scale3); + SetOp(&prog, "conv2d", "Conv4", {"g"}, {"h"}, use_mkldnn); + + return prog; +} + +void InitTensorHolder(Scope* scope, const paddle::platform::Place& place, + const char* var_name) { + auto x = scope->Var(var_name); + auto tensor = x->GetMutable(); + tensor->mutable_data(place, proto::VarType::FP32, + ::paddle::memory::Allocator::kDefault, 1); +} + +void MainTest(const ProgramDesc& prog, int removed_nodes_num) { + std::unique_ptr graph(new ir::Graph(prog)); + + // Init scope, as it is used in pass + auto place = paddle::platform::CPUPlace(); + NaiveExecutor exe{place}; + Scope scope; + exe.CreateVariables(prog, 0, true, &scope); + + for (auto& v : variable_names) { + InitTensorHolder(&scope, place, v.c_str()); + } + + graph->Set(kParamScopeAttr, new framework::Scope*(&scope)); + + auto pass = PassRegistry::Instance().Get("cpu_quantize_squash_pass"); + + int original_nodes_num = graph->Nodes().size(); + + graph = 
pass->Apply(std::move(graph)); + + int current_nodes_num = graph->Nodes().size(); + + EXPECT_EQ(original_nodes_num - removed_nodes_num, current_nodes_num); +} + +TEST(CpuQuantizeSquashPass, equal_scales) { + auto scale = 1.2345f; + auto use_mkldnn = true; + // Remove 4 nodes: Dequant, Quant, e, f + auto remove_nodes = 4; + MainTest(BuildProgramDesc(use_mkldnn, scale, scale), remove_nodes); + + use_mkldnn = !use_mkldnn; + MainTest(BuildProgramDesc(use_mkldnn, scale, scale), remove_nodes); +} + +TEST(CpuQuantizeSquashPass, inequal_scales) { + auto scale1 = 1.2345f; + auto scale2 = 21.0f; + auto use_mkldnn = true; + // Remove 3 nodes: Dequant, Quant, e + // Insert 1 node: requantize + auto remove_nodes = 2; + MainTest(BuildProgramDesc(use_mkldnn, scale1, scale2), remove_nodes); + + use_mkldnn = !use_mkldnn; + MainTest(BuildProgramDesc(use_mkldnn, scale1, scale2), remove_nodes); +} + +TEST(CpuQuantizeSquashPass, branch_to_equal_inequal_and_fp32) { + // Delete both quantize ops, + // bypass dequantize in both branches, + // insert requantize on one branch + auto scale = 1.2345f; + auto scale2 = 21.0f; + auto use_mkldnn = true; + // Remove 3 nodes: Quant1, Quant2, g + // Insert 1 node: requantize + auto remove_nodes = 2; + MainTest(BuildProgramDesc2(use_mkldnn, scale, scale, scale2), remove_nodes); + + use_mkldnn = !use_mkldnn; + MainTest(BuildProgramDesc2(use_mkldnn, scale, scale, scale2), remove_nodes); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(cpu_quantize_squash_pass); diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h index 3d4dc9e2b6ecccddea4d63e45710c80d55ef2772..c071d9aed20bd40f5c1076d2dc5d3098a4e65495 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h @@ -14,12 +14,16 @@ limitations under the License. */ #pragma once +#include #include "paddle/fluid/framework/ir/pass.h" namespace paddle { namespace framework { namespace ir { +/* + * Specifies which operators should use MKLDNN. + */ class MKLDNNPlacementPass : public Pass { protected: std::unique_ptr ApplyImpl( diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index 9eade9eaa8f00fe6e76063344f47968f4e97cf7f..72fb876d98dc84164398583baf22c49014af483a 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include #include #include diff --git a/paddle/fluid/framework/ir/runtime_context_cache_pass.cc b/paddle/fluid/framework/ir/runtime_context_cache_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..67b29512c4cf3512e4b2b4b5a18ba60a3d9120dc --- /dev/null +++ b/paddle/fluid/framework/ir/runtime_context_cache_pass.cc @@ -0,0 +1,39 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/ir/runtime_context_cache_pass.h" +#include +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +namespace ir { + +std::unique_ptr RuntimeContextCachePass::ApplyImpl( + std::unique_ptr graph) const { + VLOG(3) << "Applies Runtime Context Cache strategy."; + for (const Node* n : graph->Nodes()) { + if (n->IsOp()) { + n->Op()->SetAttr(kEnableCacheRuntimeContext, true); + } + } + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(runtime_context_cache_pass, + paddle::framework::ir::RuntimeContextCachePass); diff --git a/paddle/fluid/framework/ir/runtime_context_cache_pass.h b/paddle/fluid/framework/ir/runtime_context_cache_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..a6cf1a9ae5035f185dd3ab52bf0762a6eaf0f6e5 --- /dev/null +++ b/paddle/fluid/framework/ir/runtime_context_cache_pass.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class RuntimeContextCachePass : public Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc b/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..84fb8063e6f020d5ada2c6af7a0307360aa1c92c --- /dev/null +++ b/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc @@ -0,0 +1,244 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
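Note that RuntimeContextCachePass above only sets the kEnableCacheRuntimeContext attribute; the payoff comes at execution time, where an operator can reuse its RuntimeContext across runs instead of rebuilding the input/output variable mapping on every invocation. A rough sketch of the consuming side (simplified, with assumed member and helper names; the real logic lives on the operator side of this change):

    // Sketch: how an op might honor kEnableCacheRuntimeContext (assumed names).
    void Run(const Scope& scope) {
      if (!HasAttr(kEnableCacheRuntimeContext)) {
        // Default path: rebuild the variable mapping on every run.
        RuntimeContext ctx(Inputs(), Outputs(), scope);
        RunKernel(ctx);
      } else {
        // Cached path: build once, then reuse, skipping per-run map lookups.
        if (cached_ctx_ == nullptr) {
          cached_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope));
        }
        RunKernel(*cached_ctx_);
      }
    }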
+ +#include +#include + +#include "paddle/fluid/framework/ir/graph_viz_pass.h" +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +template +std::unique_ptr SimplifyAnakinDetectionPatternPass::ApplyImpl( + std::unique_ptr graph) const { + const std::string pattern_name = + "simplify_anakin_detection_pattern_pass" + std::to_string(times); + FusePassBase::Init(pattern_name, graph.get()); + + GraphPatternDetector gpd; + std::vector input_nodes; + for (int i = 0; i < times; i++) { + input_nodes.push_back(gpd.mutable_pattern() + ->NewNode("x" + std::to_string(i)) + ->assert_is_op_input("density_prior_box", "Input") + ->AsInput()); + } + input_nodes.push_back(gpd.mutable_pattern() + ->NewNode("x" + std::to_string(times)) + ->assert_is_op_input("box_coder", "TargetBox") + ->AsInput()); + + input_nodes.push_back(gpd.mutable_pattern() + ->NewNode("x" + std::to_string(times + 1)) + ->assert_is_op_input("transpose2") + ->AsInput()); + + patterns::AnakinDetectionPattern pattern(gpd.mutable_pattern(), pattern_name); + pattern(input_nodes, times); + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + const int kNumFields = 7; + const int kPriorBoxLocOffset = 1; + const int kReshape1Offset = 2; + const int kReshape1OutOffset = 3; + const int kPriorBoxVarOffset = 4; + const int kReshape2Offset = 5; + const int kReshape2OutOffset = 6; + std::vector nodes; + + for (int i = 0; i < times; i++) { + PADDLE_ENFORCE( + subgraph.at(pattern.GetPDNode("prior_box" + std::to_string(i)))); + PADDLE_ENFORCE( + subgraph.at(pattern.GetPDNode("box_out" + std::to_string(i)))); + PADDLE_ENFORCE( + subgraph.at(pattern.GetPDNode("reshape1" + std::to_string(i)))); + PADDLE_ENFORCE( + subgraph.at(pattern.GetPDNode("reshape1_out" + std::to_string(i)))); + PADDLE_ENFORCE( + subgraph.at(pattern.GetPDNode("reshape2" + std::to_string(i)))); + PADDLE_ENFORCE( + subgraph.at(pattern.GetPDNode("reshape2_out" + std::to_string(i)))); + + PADDLE_ENFORCE( + subgraph.at(pattern.GetPDNode("box_var_out" + std::to_string(i)))); + + nodes.push_back( + subgraph.at(pattern.GetPDNode("prior_box" + std::to_string(i)))); + nodes.push_back( + subgraph.at(pattern.GetPDNode("box_out" + std::to_string(i)))); + nodes.push_back( + subgraph.at(pattern.GetPDNode("reshape1" + std::to_string(i)))); + nodes.push_back( + subgraph.at(pattern.GetPDNode("reshape1_out" + std::to_string(i)))); + nodes.push_back( + subgraph.at(pattern.GetPDNode("box_var_out" + std::to_string(i)))); + nodes.push_back( + subgraph.at(pattern.GetPDNode("reshape2" + std::to_string(i)))); + nodes.push_back( + subgraph.at(pattern.GetPDNode("reshape2_out" + std::to_string(i)))); + } + + Node *concat_op1 = subgraph.at(pattern.GetPDNode("concat1")); + Node *concat_out1 = subgraph.at(pattern.GetPDNode("concat1_out")); + + Node *concat_op2 = subgraph.at(pattern.GetPDNode("concat2")); + Node *concat_out2 = subgraph.at(pattern.GetPDNode("concat2_out")); + + Node *box_coder_third_input = subgraph.at(input_nodes[times]); + Node *box_coder_op = subgraph.at(pattern.GetPDNode("box_coder")); + Node *box_coder_out = subgraph.at(pattern.GetPDNode("box_coder_out")); + + Node *multiclass_nms_second_input = subgraph.at(input_nodes[times + 1]); + Node *transpose_before_nms = + subgraph.at(pattern.GetPDNode("transpose_before_nms")); + Node *transpose_before_nms_out = + subgraph.at(pattern.GetPDNode("transpose_before_nms_out")); + + Node 
*multiclass_nms = subgraph.at(pattern.GetPDNode("multiclass_nms")); + Node *multiclass_nms_out = + subgraph.at(pattern.GetPDNode("multiclass_nms_out")); + + std::string code_type = + boost::get(box_coder_op->Op()->GetAttr("code_type")); + bool box_normalized = + boost::get(box_coder_op->Op()->GetAttr("box_normalized")); + // auto variance = + // boost::get>(box_coder_op->Op()->GetAttr("variance")); + int background_label = + boost::get(multiclass_nms->Op()->GetAttr("background_label")); + float score_threshold = + boost::get(multiclass_nms->Op()->GetAttr("score_threshold")); + int nms_top_k = boost::get(multiclass_nms->Op()->GetAttr("nms_top_k")); + float nms_threshold = + boost::get(multiclass_nms->Op()->GetAttr("nms_threshold")); + float nms_eta = boost::get(multiclass_nms->Op()->GetAttr("nms_eta")); + int keep_top_k = + boost::get(multiclass_nms->Op()->GetAttr("keep_top_k")); + + std::vector concat1_input_names; + for (int i = 0; i < times; i++) { + concat1_input_names.push_back( + nodes[i * kNumFields + kPriorBoxLocOffset]->Name()); + } + + // int axis = boost::get(concat_op1->Op()->GetAttr("axis")); + framework::OpDesc concat1_desc; + concat1_desc.SetType("concat"); + concat1_desc.SetInput("X", concat1_input_names); + concat1_desc.SetAttr("axis", 2); + concat1_desc.SetOutput("Out", {concat_out1->Name()}); + + auto *new_add_concat_op = graph->CreateOpNode(&concat1_desc); + + for (int i = 0; i < times; i++) { + nodes[i * kNumFields + kPriorBoxLocOffset]->outputs.push_back( + new_add_concat_op); + new_add_concat_op->inputs.push_back( + nodes[i * kNumFields + kPriorBoxLocOffset]); + } + + framework::OpDesc new_op_desc; + new_op_desc.SetType("detection_out"); + new_op_desc.SetInput("PriorBox", {concat_out1->Name()}); + new_op_desc.SetInput("TargetBox", {box_coder_third_input->Name()}); + new_op_desc.SetInput("Scores", {multiclass_nms_second_input->Name()}); + new_op_desc.SetAttr("code_type", code_type); + new_op_desc.SetAttr("box_normalized", box_normalized); + new_op_desc.SetAttr("background_label", background_label); + new_op_desc.SetAttr("score_threshold", score_threshold); + new_op_desc.SetAttr("nms_top_k", nms_top_k); + new_op_desc.SetAttr("nms_threshold", nms_threshold); + new_op_desc.SetAttr("nms_eta", nms_eta); + new_op_desc.SetAttr("keep_top_k", keep_top_k); + new_op_desc.SetOutput("Out", {multiclass_nms_out->Name()}); + new_op_desc.Flush(); + + // Create a new node for the fused op. 
+    auto *detection_out_op = graph->CreateOpNode(&new_op_desc);
+
+    std::unordered_set<const Node *> delete_nodes;
+
+    for (int i = 0; i < times; i++) {
+      nodes[i * kNumFields + kPriorBoxLocOffset]->outputs.push_back(concat_op1);
+      delete_nodes.insert(nodes[i * kNumFields + kReshape1Offset]);
+      delete_nodes.insert(nodes[i * kNumFields + kReshape1OutOffset]);
+      delete_nodes.insert(nodes[i * kNumFields + kPriorBoxVarOffset]);
+      delete_nodes.insert(nodes[i * kNumFields + kReshape2Offset]);
+      delete_nodes.insert(nodes[i * kNumFields + kReshape2OutOffset]);
+    }
+
+    delete_nodes.insert(concat_op1);
+    delete_nodes.insert(concat_op2);
+    delete_nodes.insert(concat_out2);
+    delete_nodes.insert(box_coder_op);
+    delete_nodes.insert(box_coder_out);
+    delete_nodes.insert(transpose_before_nms);
+    delete_nodes.insert(transpose_before_nms_out);
+    delete_nodes.insert(multiclass_nms);
+
+    new_add_concat_op->outputs.push_back(concat_out1);
+    concat_out1->inputs.push_back(new_add_concat_op);
+
+    detection_out_op->inputs.push_back(concat_out1);
+    detection_out_op->inputs.push_back(box_coder_third_input);
+    detection_out_op->inputs.push_back(multiclass_nms_second_input);
+    detection_out_op->outputs.push_back(multiclass_nms_out);
+
+    concat_out1->outputs.push_back(detection_out_op);
+    box_coder_third_input->outputs.push_back(detection_out_op);
+    multiclass_nms_second_input->outputs.push_back(detection_out_op);
+    multiclass_nms_out->inputs.push_back(detection_out_op);
+
+    // Delete the unneeded nodes.
+    GraphSafeRemoveNodes(graph.get(), delete_nodes);
+  };
+
+  gpd(graph.get(), handler);
+  return graph;
+}
+
+template class SimplifyAnakinDetectionPatternPass<1>;
+template class SimplifyAnakinDetectionPatternPass<2>;
+template class SimplifyAnakinDetectionPatternPass<3>;
+template class SimplifyAnakinDetectionPatternPass<4>;
+template class SimplifyAnakinDetectionPatternPass<5>;
+template class SimplifyAnakinDetectionPatternPass<6>;
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(simplify_anakin_detection_pattern_pass,
+              paddle::framework::ir::SimplifyAnakinDetectionPatternPass<1>);
+
+REGISTER_PASS(simplify_anakin_detection_pattern_pass2,
+              paddle::framework::ir::SimplifyAnakinDetectionPatternPass<2>);
+
+REGISTER_PASS(simplify_anakin_detection_pattern_pass3,
+              paddle::framework::ir::SimplifyAnakinDetectionPatternPass<3>);
+
+REGISTER_PASS(simplify_anakin_detection_pattern_pass4,
+              paddle::framework::ir::SimplifyAnakinDetectionPatternPass<4>);
+
+REGISTER_PASS(simplify_anakin_detection_pattern_pass5,
+              paddle::framework::ir::SimplifyAnakinDetectionPatternPass<5>);
+
+REGISTER_PASS(simplify_anakin_detection_pattern_pass6,
+              paddle::framework::ir::SimplifyAnakinDetectionPatternPass<6>);
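+
+// Usage sketch (illustrative, not part of the pass): a registered variant can
+// be fetched from the registry and applied to a graph, mirroring how the
+// sync_batch_norm_pass tester further below drives a pass:
+//   auto pass = framework::ir::PassRegistry::Instance().Get(
+//       "simplify_anakin_detection_pattern_pass");
+//   graph = pass->Apply(std::move(graph));
+// The numeric suffix in the pass name selects how many repeated detection
+// branches the pattern expects to match.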
diff --git a/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h b/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..2338e4c38b253e2110addcca494e1cae5b58beaf
--- /dev/null
+++ b/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h
@@ -0,0 +1,41 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <memory>
+#include <string>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+// There may be many repeated detection branches in a model, and the outputs
+// of these branches are used as inputs to the concat Op. This pattern will
+// be detected by our pass. The times here represents the repeat times of this
+// structure.
+template <int times>
+class SimplifyAnakinDetectionPatternPass : public FusePassBase {
+ public:
+  virtual ~SimplifyAnakinDetectionPatternPass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/sync_batch_norm_pass.cc b/paddle/fluid/framework/ir/sync_batch_norm_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b37003991505140b0d531a4ea2b481c6d4b09d75
--- /dev/null
+++ b/paddle/fluid/framework/ir/sync_batch_norm_pass.cc
@@ -0,0 +1,45 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/sync_batch_norm_pass.h"
+#include <memory>
+#include <string>
+#include <utility>
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+std::unique_ptr<ir::Graph> SyncBatchNormPass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  VLOG(3) << "Use synchronous batch norm";
+  for (const Node* n : graph->Nodes()) {
+    if (n->IsOp()) {
+      auto* op = n->Op();
+      if (op->Type() == "batch_norm") {
+        op->SetType("sync_batch_norm");
+      }
+      if (op->Type() == "batch_norm_grad") {
+        op->SetType("sync_batch_norm_grad");
+      }
+    }
+  }
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(sync_batch_norm_pass, paddle::framework::ir::SyncBatchNormPass);
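+
+// Net effect (sketch): after this pass runs, every batch_norm op in the graph
+// becomes sync_batch_norm and every batch_norm_grad becomes
+// sync_batch_norm_grad, so mean/variance statistics are synchronized across
+// devices at runtime instead of being computed per device.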
diff --git a/paddle/fluid/framework/ir/sync_batch_norm_pass.h b/paddle/fluid/framework/ir/sync_batch_norm_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..51cce3dca69330071f7d12efef08e2006e8bd7ac
--- /dev/null
+++ b/paddle/fluid/framework/ir/sync_batch_norm_pass.h
@@ -0,0 +1,32 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class SyncBatchNormPass : public Pass {
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc b/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9c94c1746a6590df5a43c099b9c4c3678ca6e393
--- /dev/null
+++ b/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc
@@ -0,0 +1,80 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/sync_batch_norm_pass.h"
+#include <gtest/gtest.h>
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
+           const std::vector<std::string>& inputs,
+           const std::vector<std::string>& outputs) {
+  auto* op = prog->MutableBlock(0)->AppendOp();
+  op->SetType(type);
+  op->SetAttr("name", name);
+  op->SetInput("X", inputs);
+  op->SetOutput("Out", outputs);
+}
+
+// (a, conv_w)->conv2d->b
+// (b, bn_scale, bn_bias, mean, var)->batch_norm
+//     ->(c, mean, var, save_mean, save_inv_var)
+ProgramDesc BuildProgramDesc() {
+  ProgramDesc prog;
+  for (auto& v : std::vector<std::string>({"a", "conv_w", "b", "bn_scale",
+                                           "bn_bias", "mean", "var", "c",
+                                           "save_mean", "save_inv_var"})) {
+    auto* var = prog.MutableBlock(0)->Var(v);
+    if (v == "conv_w" || v == "bn_scale" || v == "bn_bias" || v == "mean" ||
+        v == "var") {
+      var->SetPersistable(true);
+    }
+  }
+
+  SetOp(&prog, "conv2d", "conv", std::vector<std::string>({"a", "conv_w"}),
+        std::vector<std::string>({"b"}));
+  SetOp(&prog, "batch_norm", "bn",
+        std::vector<std::string>({"b", "bn_scale", "bn_bias", "mean", "var"}),
+        std::vector<std::string>(
+            {"c", "mean", "var", "save_mean", "save_inv_var"}));
+  return prog;
+}
+
+TEST(SyncBatchNormPass, basic) {
+  auto prog = BuildProgramDesc();
+
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+
+  auto pass = PassRegistry::Instance().Get("sync_batch_norm_pass");
+
+  graph = pass->Apply(std::move(graph));
+
+  for (auto* node : graph->Nodes()) {
+    if (node->IsOp()) {
+      auto* op = node->Op();
+      auto op_name = boost::get<std::string>(op->GetAttr("name"));
+      if (op_name == "bn") {
+        ASSERT_EQ(op->Type(), "sync_batch_norm");
+      }
+    }
+  }
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+USE_PASS(sync_batch_norm_pass);
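+
+// Note: USE_PASS forces the pass registration symbol to be linked into the
+// test binary so that PassRegistry::Instance().Get("sync_batch_norm_pass")
+// above can find it; without it the registrar in the pass library could be
+// stripped by the linker.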
diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc
index fda43948d567689103815e3ad7ba285719dae80f..cab69c408defadad32eba83e47d18f0f82ccc771 100644
--- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc
@@ -12,7 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <memory>
 #include <string>
+#include <unordered_set>
 #include <vector>
 
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
@@ -123,6 +125,7 @@ std::unique_ptr<ir::Graph> TransposeFlattenConcatFusePass<times>::ApplyImpl(
 }
 
 template class TransposeFlattenConcatFusePass<1>;
+template class TransposeFlattenConcatFusePass<2>;
 template class TransposeFlattenConcatFusePass<3>;
 template class TransposeFlattenConcatFusePass<4>;
 template class TransposeFlattenConcatFusePass<5>;
@@ -135,6 +138,9 @@ template class TransposeFlattenConcatFusePass<6>;
 REGISTER_PASS(transpose_flatten_concat_fuse_pass,
               paddle::framework::ir::TransposeFlattenConcatFusePass<1>);
 
+REGISTER_PASS(transpose_flatten2_concat_fuse_pass,
+              paddle::framework::ir::TransposeFlattenConcatFusePass<2>);
+
 REGISTER_PASS(transpose_flatten3_concat_fuse_pass,
               paddle::framework::ir::TransposeFlattenConcatFusePass<3>);
diff --git a/paddle/fluid/framework/no_need_buffer_vars_inference.h b/paddle/fluid/framework/no_need_buffer_vars_inference.h
new file mode 100644
index 0000000000000000000000000000000000000000..2c933659840d02e65c3b222144a31e558e8e8ae8
--- /dev/null
+++ b/paddle/fluid/framework/no_need_buffer_vars_inference.h
@@ -0,0 +1,60 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <unordered_set>
+#include <vector>
+#include "paddle/fluid/framework/op_desc.h"
+
+namespace paddle {
+namespace framework {
+
+class NoNeedBufferVarsInference {
+ public:
+  NoNeedBufferVarsInference(const VariableNameMap &inputs,
+                            const VariableNameMap &outputs,
+                            const AttributeMap &attrs)
+      : inputs_(inputs), outputs_(outputs), attrs_(attrs) {}
+
+  virtual ~NoNeedBufferVarsInference() = default;
+
+  const VariableNameMap &Inputs() const { return inputs_; }
+
+  const VariableNameMap &Outputs() const { return outputs_; }
+
+  const AttributeMap &Attrs() const { return attrs_; }
+
+  virtual std::unordered_set<std::string> operator()() const = 0;
+
+ private:
+  const VariableNameMap &inputs_;
+  const VariableNameMap &outputs_;
+  const AttributeMap &attrs_;
+};
+
+#define DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(class_type, ...)                \
+  class class_type : public ::paddle::framework::NoNeedBufferVarsInference {  \
+   public:                                                                    \
+    using ::paddle::framework::NoNeedBufferVarsInference::                    \
+        NoNeedBufferVarsInference;                                            \
+                                                                              \
+    std::unordered_set<std::string> operator()() const override {             \
+      return {__VA_ARGS__};                                                   \
+    }                                                                         \
+  }
+
+}  // namespace framework
+}  // namespace paddle
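+
+// Example (sketch, hypothetical op): an op whose "X" input is only inspected
+// for dims/LoD and whose tensor buffer is never read could declare
+//   DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(FooNoNeedBufferVarsInference, "X");
+// and register it, so OperatorWithKernel::PrepareData can skip transferring
+// the data of "X" (see the no_buffer_ins handling in operator.cc below).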
diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc
index 0e7b0cbeb98f3b6bbf0b37f507fc6022be692bb1..353db435213c74982d582e5be298ecfb1a810f30 100644
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -24,6 +24,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/shape_inference.h"
+#include "paddle/fluid/framework/var_type_inference.h"
 
 namespace paddle {
 namespace framework {
@@ -372,6 +373,11 @@ std::vector<std::string> OpDesc::AttrNames() const {
   return retv;
 }
 
+void OpDesc::RemoveAttr(const std::string &name) {
+  attrs_.erase(name);
+  need_update_ = true;
+}
+
 void OpDesc::SetAttr(const std::string &name, const Attribute &v) {
   // NOTICE(minqiyang): pybind11 will take the empty list in python as
   // the std::vector<int> type in C++; so we have to change the attr's type
@@ -643,6 +649,7 @@ void OpDesc::CheckAttrs() {
     // not by users.
     return;
   }
+  VLOG(10) << "begin to check attribute of " << Type();
  checker->Check(&attrs_);
 }
 
@@ -677,7 +684,8 @@ void OpDesc::InferVarType(BlockDesc *block) const {
   // var type inference. Hence, we don't do any "default" setting here.
   auto &info = OpInfoMap::Instance().Get(this->Type());
   if (info.infer_var_type_) {
-    info.infer_var_type_(*this, block);
+    InferVarTypeContext context(this, block);
+    info.infer_var_type_(&context);
   }
 }
 
diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h
index d7352c5ee5a63bc8b8023e1d3459c5b9f5fab8a7..dedaf24364703877a4cacb23a27550b54dad53f8 100644
--- a/paddle/fluid/framework/op_desc.h
+++ b/paddle/fluid/framework/op_desc.h
@@ -72,6 +72,7 @@ class OpDesc {
   std::vector<std::string> AttrNames() const;
 
   void SetAttr(const std::string &name, const Attribute &v);
+  void RemoveAttr(const std::string &name);
 
   void SetBlockAttr(const std::string &name, BlockDesc *block);
 
diff --git a/paddle/fluid/framework/op_info.h b/paddle/fluid/framework/op_info.h
index 4b55bd0703eee399cd841f90ea0b18d8fbdc67e8..e200d188b3f2462657bbac086d7659b1f85e55e9 100644
--- a/paddle/fluid/framework/op_info.h
+++ b/paddle/fluid/framework/op_info.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <unordered_map>
 
 #include "paddle/fluid/framework/attribute.h"
+#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
 #include "paddle/fluid/framework/type_defs.h"
 #include "paddle/fluid/platform/macros.h"
 
@@ -39,6 +40,7 @@ struct OpInfo {
   InferVarTypeFN infer_var_type_;
   InferShapeFN infer_shape_;
   InferInplaceOpFN infer_inplace_;
+  InferNoNeedBufferVarsFN infer_no_need_buffer_vars_;
 
   bool HasOpProtoAndChecker() const {
     return proto_ != nullptr && checker_ != nullptr;
@@ -64,6 +66,10 @@ struct OpInfo {
   }
 
   const OpAttrChecker* Checker() const { return checker_; }
+
+  const InferNoNeedBufferVarsFN& NoNeedBufferVarsInferer() const {
+    return infer_no_need_buffer_vars_;
+  }
 };
 
 class OpInfoMap {
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index df1689764d21fcbb054a0bf32ef725541bdaefe3..b0ac73f9f52076a9303417bc1b19208ba6e6f2ec 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <algorithm>
 #include <sstream>
 #include <string>
+#include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/data_transform.h"
 #include "paddle/fluid/framework/executor.h"
@@ -64,9 +65,9 @@ static DDim GetDims(const Scope& scope, const std::string& name,
 
   if (var->IsType<LoDTensor>()) {
     const LoDTensor& tensor = var->Get<LoDTensor>();
-    if (UNLIKELY(!tensor.IsInitialized())) {
-      return DDim({-1});
-    }
+    // if (UNLIKELY(!tensor.IsInitialized())) {
+    //   return DDim({-1});
+    // }
     return tensor.dims();
   } else if (var->IsType<SelectedRows>()) {
     if (get_actual_dim) {
@@ -132,9 +133,9 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
 
   if (var->IsType<LoDTensor>()) {
     const LoDTensor& tensor = var->Get<LoDTensor>();
-    if (UNLIKELY(!tensor.IsInitialized())) {
-      return default_lod;
-    }
+    // if (UNLIKELY(!tensor.IsInitialized())) {
+    //   return default_lod;
+    // }
     return tensor.lod();
   } else {
     return default_lod;
@@ -186,14 +187,14 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
     VLOG(3) << place << " " << DebugStringEx(&scope);
   } catch (platform::EnforceNotMet exception) {
     if (Attrs().count("sub_block") != 0) {
-      throw;
+      throw std::move(exception);
     }
 
     auto& callstack = Attr<std::vector<std::string>>(
         OpProtoAndCheckerMaker::OpCreationCallstackAttrName());
 
     if (callstack.empty()) {
-      throw;
+      throw std::move(exception);
     }
     std::ostringstream sout;
     sout << "Invoke operator " << Type() << " error.\n";
@@ -204,7 +205,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
     sout << "C++ Callstacks: \n";
     sout << exception.err_str_;
     exception.err_str_ = sout.str();
-    throw;
+    throw std::move(exception);
   } catch (...) {
     std::rethrow_exception(std::current_exception());
   }
@@ -326,7 +327,12 @@ OperatorBase::OperatorBase(const std::string& type,
                            const VariableNameMap& inputs,
                            const VariableNameMap& outputs,
                            const AttributeMap& attrs)
-    : type_(type), inputs_(inputs), outputs_(outputs), attrs_(attrs) {
+    : type_(type),
+      inputs_(inputs),
+      outputs_(outputs),
+      attrs_(attrs),
+      // NOTE(zjl): why op_info may be nullptr?
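+      // (GetNullable returns nullptr instead of failing an enforce when the
+      // op type has no registered OpInfo, so ops constructed outside the
+      // registry still work; Info() below enforces non-null on first use.)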
+      info_(OpInfoMap::Instance().GetNullable(type)) {
   GenerateTemporaryNames();
   CheckAllInputOutputSet();
 }
@@ -350,7 +356,7 @@ std::vector<std::string> OperatorBase::OutputVars(bool has_intermediate) const {
     }
     return ret_val;
   }
-  auto& info = OpInfoMap::Instance().Get(Type());
+  auto& info = Info();
 
   // get all OpProto::Var for outputs
   for (auto& o : info.Proto().outputs()) {
@@ -366,18 +372,16 @@ std::vector<std::string> OperatorBase::OutputVars(bool has_intermediate) const {
 }
 
 void OperatorBase::CheckAllInputOutputSet() const {
-  auto& info_map = OpInfoMap::Instance();
-  auto* op_info = info_map.GetNullable(Type());
-  if (op_info == nullptr || op_info->proto_ == nullptr) return;
+  if (info_ == nullptr || info_->proto_ == nullptr) return;
 
-  for (auto& in : op_info->Proto().inputs()) {
+  for (auto& in : info_->Proto().inputs()) {
     if (!in.dispensable()) {
       PADDLE_ENFORCE(inputs_.find(in.name()) != inputs_.end(),
                      "Operator %s's input, %s, is not set", Type(), in.name());
     }
   }
 
-  for (auto& out : op_info->Proto().outputs()) {
+  for (auto& out : info_->Proto().outputs()) {
     if (!out.dispensable()) {
       PADDLE_ENFORCE(outputs_.find(out.name()) != outputs_.end(),
                      "Operator %s's output, %s, is not set", Type(),
@@ -876,7 +880,22 @@ std::vector<KernelConfig>* OperatorWithKernel::GetKernelConfig(
 
 void OperatorWithKernel::RunImpl(const Scope& scope,
                                  const platform::Place& place) const {
-  RuntimeContext ctx(Inputs(), Outputs(), scope);
+  if (!HasAttr(kEnableCacheRuntimeContext)) {
+    RuntimeContext ctx(Inputs(), Outputs(), scope);
+    RunImpl(scope, place, &ctx);
+  } else {
+    const Scope* cur_scope = &scope;
+    if (!runtime_ctx_ || pre_scope_ != cur_scope) {
+      runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope));
+      pre_scope_ = cur_scope;
+    }
+    RunImpl(scope, place, runtime_ctx_.get());
+  }
+}
+
+void OperatorWithKernel::RunImpl(const Scope& scope,
+                                 const platform::Place& place,
+                                 RuntimeContext* runtime_ctx) const {
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   auto* dev_ctx = pool.Get(place);
 
@@ -891,7 +910,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   OpKernelMap& kernels = kernels_iter->second;
 
   auto expected_kernel_key = this->GetExpectedKernelType(
-      ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr));
+      ExecutionContext(*this, scope, *dev_ctx, *runtime_ctx, nullptr));
   VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
 
   auto kernel_iter = kernels.find(expected_kernel_key);
@@ -915,8 +934,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
 
   // do data transformScope &transfer_scope;
   std::vector<std::string> transfered_inplace_vars;
-  auto* transfer_scope =
-      PrepareData(scope, expected_kernel_key, &transfered_inplace_vars, &ctx);
+  auto* transfer_scope = PrepareData(scope, expected_kernel_key,
+                                     &transfered_inplace_vars, runtime_ctx);
 
   // exec scope is the scope that kernel actually executed on.
   const Scope& exec_scope =
@@ -926,12 +945,14 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
     dev_ctx = pool.Get(expected_kernel_key.place_);
   }
 
-  RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, ctx);
-  this->InferShape(&infer_shape_ctx);
+  if (!HasAttr(kAllKernelsMustComputeRuntimeShape)) {
+    RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, *runtime_ctx);
+    this->InferShape(&infer_shape_ctx);
+  }
 
   // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
   // not Scope. Imperative mode only pass inputs and get outputs.
-  kernel_iter->second(
-      ExecutionContext(*this, exec_scope, *dev_ctx, ctx, kernel_configs));
+  kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx,
+                                       *runtime_ctx, kernel_configs));
 
   if (!transfered_inplace_vars.empty()) {
     // there is inplace variable has been transfered.
@@ -980,7 +1001,27 @@ Scope* OperatorWithKernel::PrepareData(
     std::vector<std::string>* transfered_inplace_vars,
     RuntimeContext* ctx) const {
   Scope* new_scope = nullptr;
+
+  std::unordered_set<std::string> no_buffer_ins;
+  if (info_) {
+    auto& no_buffer_inferer = info_->NoNeedBufferVarsInferer();
+    // Some op may not register NoNeedBufferVarsInferer
+    if (no_buffer_inferer) {
+      no_buffer_ins = no_buffer_inferer(Inputs(), Outputs(), Attrs());
+    }
+  }
+
   for (auto& var_name_item : Inputs()) {
+    // NOTE(zjl): STL does not guarantee fast std::unordered_set::count when set
+    // is empty. At least STL implemented on my mac does calculate hash code
+    // of search key even though the set is empty.
+    if (!no_buffer_ins.empty() &&
+        no_buffer_ins.count(var_name_item.first) > 0) {
+      VLOG(1) << "Skip scanning input " << var_name_item.first
+              << " in Operator " << type_;
+      continue;
+    }
+
     std::vector<Variable*>& input_vars = ctx->inputs[var_name_item.first];
 
     for (size_t i = 0; i < var_name_item.second.size(); ++i) {
@@ -1069,8 +1110,9 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
       proto::VarType::Type tmp = t->type();
       PADDLE_ENFORCE(
          tmp == data_type || data_type == dafault_data_type,
-          "DataType of Paddle Op %s must be the same. Get (%d) != (%d)",
-          Type(), DataTypeToString(data_type), DataTypeToString(tmp));
+          "DataType of Paddle Op %s %s must be the same. Get (%d) != (%d)",
+          Type(), input.first, DataTypeToString(data_type),
+          DataTypeToString(tmp));
       data_type = tmp;
     }
   }
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 55629636a816982c4debe4b5b7138558ac309eb5..6d8ba430bd0b9c9e48b4a80a07feb24b2da7d7b8 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -62,6 +62,23 @@ constexpr char kZeroVarSuffix[] = "@ZERO";
 /// Variables with this suffix are the new Gradient.
 constexpr char kNewGradSuffix[] = "@NEWGRAD@";
 
+/// RuntimeContext is used to relate input/output names of Operator with
+/// the corresponding variables in name scope.
+/// If an Op has attribute kEnableCacheRuntimeContext, it means that in a same
+/// name scope, since the input/output names of this Op do not change in the
+/// execution, RuntimeContext could be created only at the first iteration of
+/// this Op's execution to save the elapsed time.
+constexpr char kEnableCacheRuntimeContext[] = "@ENABLE_CACHE_RUNTIME_CONTEXT@";
+
+/// If an Op has this attribute, all its kernels should calculate output
+/// variable's shape in the corresponding Compute() function. And
+/// OperatorWithKernel::RunImpl() would skip calling this Op's InferShape()
+/// function at runtime for speedup.
+/// TODO(luotao): Note that this temporary attribute would be deleted after all
+/// ops contain it.
+constexpr char kAllKernelsMustComputeRuntimeShape[] =
+    "@ALL_KERNELS_MUST_COMPUTE_RUNTIME_SHAPE@";
+
 // define some kernel priority
 /* Define multiple kernel type fallback order*/
 extern std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority;
@@ -143,6 +160,11 @@ class OperatorBase {
   const VariableNameMap& Inputs() const { return inputs_; }
   const VariableNameMap& Outputs() const { return outputs_; }
 
+  const OpInfo& Info() const {
+    PADDLE_ENFORCE_NOT_NULL(info_, "OpInfo of %s is not found", type_);
+    return *info_;
+  }
+
   bool HasInputs(const std::string& name) const;
   //! Get a input with argument's name described in `op_proto`
   std::string Input(const std::string& name) const;
@@ -177,6 +199,10 @@ class OperatorBase {
   // IG (Inputs Gradients)
   VariableNameMap outputs_;
   AttributeMap attrs_;
+
+  // OpInfo
+  const OpInfo* info_;
+
   // Whether this operator executes in an Executor.
   bool run_by_executor_{true};
 
@@ -339,9 +365,6 @@ class ExecutionContext {
     auto shared_allocation = std::shared_ptr<memory::allocation::Allocation>(
        allocation_ptr, deleter);
 
-    PADDLE_ENFORCE(
-        dynamic_cast<platform::TemporaryAllocation*>(allocation_ptr) != nullptr,
-        "The AllocationPtr must be TemporaryAllocation.");
     PADDLE_ENFORCE_GE(allocation_ptr->size(),
                       framework::product(dim) * sizeof(T));
 
@@ -427,7 +450,7 @@ class OperatorWithKernel : public OperatorBase {
   }
 
   virtual void InferShape(InferShapeContext* ctx) const {
-    OpInfoMap::Instance().Get(Type()).infer_shape_(ctx);
+    Info().infer_shape_(ctx);
   }
 
   void RuntimeInferShape(const Scope& scope, const platform::Place& place,
@@ -447,6 +470,8 @@ class OperatorWithKernel : public OperatorBase {
   // same.
   proto::VarType::Type IndicateDataType(const ExecutionContext& ctx) const;
   void RunImpl(const Scope& scope, const platform::Place& place) const final;
+  void RunImpl(const Scope& scope, const platform::Place& place,
+               RuntimeContext* runtime_ctx) const;
 
   /**
   * Transfer data from scope to a transfered scope. If there is no data need to
@@ -465,6 +490,8 @@ class OperatorWithKernel : public OperatorBase {
 
  protected:
   mutable OpKernelConfigsMap kernel_configs_map_;
+  mutable std::unique_ptr<RuntimeContext> runtime_ctx_;
+  mutable const Scope* pre_scope_ = nullptr;
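+  // runtime_ctx_ caches the RuntimeContext between runs when the op carries
+  // kEnableCacheRuntimeContext; pre_scope_ remembers which scope it was built
+  // for, so running in a different scope rebuilds the cache. Opt-in sketch
+  // (hypothetical): op_desc->SetAttr(kEnableCacheRuntimeContext, true);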
*/ #include "paddle/fluid/framework/parallel_executor.h" #include +#include #include #include +#include #include #include "paddle/fluid/framework/ir/graph_helper.h" @@ -181,13 +183,14 @@ std::vector &ParallelExecutor::GetLocalScopes() { return member_->local_scopes_; } -ParallelExecutor::ParallelExecutor( - const std::vector &places, - const std::unordered_set &bcast_vars, - const std::string &loss_var_name, Scope *scope, - const std::vector &local_scopes, - const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy, - ir::Graph *graph) +ParallelExecutor::ParallelExecutor(const std::vector &places, + const std::vector &bcast_vars, + const std::string &loss_var_name, + Scope *scope, + const std::vector &local_scopes, + const ExecutionStrategy &exec_strategy, + const BuildStrategy &build_strategy, + ir::Graph *graph) : member_(new ParallelExecutorPrivate(places)) { member_->global_scope_ = scope; member_->use_cuda_ = exec_strategy.use_cuda_; @@ -250,13 +253,52 @@ ParallelExecutor::ParallelExecutor( member_->nccl_ctxs_.reset(new platform::NCCLContextMap( member_->places_, nccl_id, build_strategy.num_trainers_, build_strategy.trainer_id_)); + + // Initialize device context's nccl comm, will be used by normal + // Operators like sync_batch_norm, and collective ops. + // NOTE: more than one ParallelExecutor with same place, the nccl comm will + // be rewrite and there will be some problem. + // NOTE: NCCL group-calls and non-group-calls can not use the same + // NCCL communicator, so for ParallelGraph and Multi-Process mode, re-use + // same communicators. + std::unique_ptr dev_nccl_ctxs; + if (nccl_id == nullptr) { + dev_nccl_ctxs.reset(new platform::NCCLContextMap(member_->places_)); + } + for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto *dev_ctx = static_cast( + pool.Get(member_->places_[dev_id])); + if (nccl_id != nullptr) { + auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[dev_id]); + dev_ctx->set_nccl_comm(nccl_ctx.comm()); + } else { + auto &nccl_ctx = dev_nccl_ctxs->at(member_->places_[dev_id]); + dev_ctx->set_nccl_comm(nccl_ctx.comm()); + } + } #else PADDLE_THROW("Not compiled with CUDA"); #endif } - if (member_->local_scopes_.size() != 1 && local_scopes.empty()) { - BCastParamsToDevices(bcast_vars); + // broadcast parameters from the 0th device to others: + auto need_broadcast = [&]() -> bool { + if (build_strategy.num_trainers_ > 1) { + // 1. num_tariners would be grater than 1 for nccl distributed training. + return true; + } else if (member_->local_scopes_.size() != 1 && local_scopes.empty()) { + // 2. Only one trainer process, but ParallelExecutor hold multiple + // devices. + return true; + } + return false; + }; + + if (need_broadcast()) { + BCastParamsToDevices(bcast_vars, build_strategy.trainer_id_); } + // Startup Program has been run. All local scopes has correct parameters. // Step 2. Convert main_program to SSA form and dependency graph. Also, insert @@ -338,7 +380,7 @@ ParallelExecutor::ParallelExecutor( } void ParallelExecutor::BCastParamsToDevices( - const std::unordered_set &vars) const { + const std::vector &vars, int trainer_id) const { // the initializing bcast, all vars would be bcast from device(0). 
   for (auto &var : vars) {
     framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var);
@@ -362,7 +404,7 @@ void ParallelExecutor::BCastParamsToDevices(
         auto place = member_->places_[i];
         void *buffer;
 
-        if (i == 0) {
+        if (i == 0 && trainer_id == 0) {
           buffer = const_cast<void *>(main_tensor.data<void>());
         } else {
           auto local_scope = member_->local_scopes_[i];
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index ddf60b39466e72822142e1dad2cfe9a97b6cf6f2..d4658b9623fe8c23b6a8b2903e3c48d794ba1652 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -14,9 +14,11 @@ limitations under the License. */
 
 #pragma once
 
+#include <memory>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
+#include <utility>
 #include <vector>
 
 #include "paddle/fluid/framework/details/build_strategy.h"
@@ -45,7 +47,7 @@ class ParallelExecutor {
 
 public:
   explicit ParallelExecutor(const std::vector<platform::Place> &places,
-                            const std::unordered_set<std::string> &bcast_vars,
+                            const std::vector<std::string> &bcast_vars,
                             const std::string &loss_var_name, Scope *scope,
                             const std::vector<Scope *> &local_scopes,
                             const ExecutionStrategy &exec_strategy,
@@ -70,7 +72,10 @@ class ParallelExecutor {
                     const std::string &fetched_var_name);
 
 private:
-  void BCastParamsToDevices(const std::unordered_set<std::string> &vars) const;
+  // broadcast the parameters from the 0th device.
+  // trainer_id is the trainer index in nccl distributed training.
+  void BCastParamsToDevices(const std::vector<std::string> &vars,
+                            int trainer_id = 0) const;
 
   bool EnableParallelGraphExecution(const ir::Graph &graph,
                                     const ExecutionStrategy &exec_strategy,
                                     const BuildStrategy &build_strategy) const;
diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc
index 87f0f307d30bc90a43a698c3766b16c975f0635e..a96baaf41f3fcd24817421a7b620343558cd78d1 100644
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -29,15 +29,6 @@ DEFINE_bool(
     "Delete local scope eagerly. It will reduce GPU memory usage but "
     "slow down the destruction of variables.(around 1% performance harm)");
 
-DEFINE_double(
-    eager_delete_tensor_gb, -1.0,
-    "Memory size threshold (GB) when the garbage collector clear tensors."
-    "Disabled when this value is less than 0");
-
-DEFINE_bool(fast_eager_deletion_mode, false,
-            "Fast eager deletion mode. If enabled, memory would release "
-            "immediately without waiting GPU kernel ends.");
-
 // When in inference scenario, the scopes will not be written by two threads in
 // a mean time, but a scope may be read by multiple threads concurrently, and
 // the mutex will cause serious performance issue.
@@ -57,15 +48,6 @@ DEFINE_bool(fast_eager_deletion_mode, false,
 namespace paddle {
 namespace framework {
 
-int64_t GetEagerDeletionThreshold() {
-  return FLAGS_eager_delete_tensor_gb < 0
-             ? -1
-             : static_cast<int64_t>(FLAGS_eager_delete_tensor_gb *
-                                    (static_cast<int64_t>(1) << 30));
-}
-
-bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; }
-
 Scope::~Scope() { DropKids(); }
 
 Scope& Scope::NewScope() const {
diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h
index f0915d2eee072b0bcd53f37dad5ef9d801c87172..242cbae7163c48fa44dca9237f1cd35f9ec98442 100644
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -32,9 +32,6 @@ extern "C" {
 namespace paddle {
 namespace framework {
 
-int64_t GetEagerDeletionThreshold();
-bool IsFastEagerDeletionModeEnabled();
-
 class Scope;
 
 /**
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index 89166bfd15f26e066d32a7191217a9b9a8977bda..5f21dae60586e926472fc512eca7bcbb55dc8eda 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -18,6 +18,7 @@
 #include <limits>
 #include <vector>
 #include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace framework {
@@ -43,6 +44,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
             << dst_place;
     return;
   }
+#ifdef PADDLE_WITH_MKLDNN
+  if (src.layout() == DataLayout::kMKLDNN) {
+    dst->set_mkldnn_prim_desc(src.get_mkldnn_prim_desc());
+  }
+#endif
   memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
               boost::get<platform::CPUPlace>(src_place), src_ptr, size);
 }
@@ -137,16 +143,19 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
 #ifdef PADDLE_WITH_CUDA
   else if (platform::is_gpu_place(src_place) &&  // NOLINT
            platform::is_cpu_place(dst_place)) {
+    platform::RecordEvent record_event("TensorCopy:GPU->CPU");
     auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
     auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
     memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
   } else if (platform::is_cpu_place(src_place) &&
             platform::is_gpu_place(dst_place)) {
+    platform::RecordEvent record_event("TensorCopy:CPU->GPU");
     auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
     auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
     memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr);
   } else if (platform::is_gpu_place(src_place) &&
             platform::is_gpu_place(dst_place)) {
+    platform::RecordEvent record_event("TensorCopy:GPU->GPU");
     if (src_ptr == dst_ptr && platform::is_same_place(src_place, dst_place)) {
       VLOG(3) << "Skip copy the same data from " << src_place << " to "
               << dst_place;
@@ -157,6 +166,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
     memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
   } else if (platform::is_cuda_pinned_place(src_place) &&
             platform::is_gpu_place(dst_place)) {
+    platform::RecordEvent record_event("TensorCopy:CUDAPinned->GPU");
     auto src_pinned_place = boost::get<platform::CUDAPinnedPlace>(src_place);
     auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
     memory::Copy(dst_gpu_place, dst_ptr, src_pinned_place, src_ptr, size,
diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h
index d02c699b979d7693bd83fd43fc73f7e0aeddb0cc..4ae6a272d5b043f25015ad8d5cfc2139d394ed5c 100644
--- a/paddle/fluid/framework/type_defs.h
+++ b/paddle/fluid/framework/type_defs.h
@@ -27,8 +27,10 @@ namespace framework {
 class OperatorBase;
 class OpDesc;
 class InferShapeContext;
+class InferVarTypeContext;
 class BlockDesc;
 class Variable;
+class NoNeedBufferVarsInference;
 
 using VariableNameMap = std::map<std::string, std::vector<std::string>>;
 // TODO(panyx0718): Replace vector with something like gtl::Vector.
@@ -53,12 +55,16 @@ using GradOpMakerFN = std::function<std::vector<std::unique_ptr<OpDesc>>(
     const std::vector<BlockDesc*>& grad_block)>;
 
 using InferVarTypeFN =
-    std::function<void(const OpDesc& /*op_desc*/, BlockDesc* /*block*/)>;
+    std::function<void(framework::InferVarTypeContext* /*context*/)>;
 
 using InferShapeFN = std::function<void(InferShapeContext*)>;
 
 using InplacePair = std::unordered_map<std::string, std::string>;
-using InferInplaceOpFN = std::function<InplacePair(const OpDesc& /*op_desc*/, BlockDesc* /*block*/)>;
+using InferInplaceOpFN = std::function<InplacePair(const OpDesc& /*op_desc*/)>;
+
+using InferNoNeedBufferVarsFN = std::function<std::unordered_set<std::string>(
+    const VariableNameMap& /*inputs*/, const VariableNameMap& /*outputs*/,
+    const AttributeMap& /*attrs*/)>;
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/var_type_inference.h b/paddle/fluid/framework/var_type_inference.h
index 64236b78d2e390ea5f6c43c76a4b33b62c67629f..2e9c64d3e6854bf70c0aee06128b9f1b7c8c7439 100644
--- a/paddle/fluid/framework/var_type_inference.h
+++ b/paddle/fluid/framework/var_type_inference.h
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #pragma once
 #include <string>
+#include <unordered_map>
+#include <vector>
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/type_defs.h"
@@ -21,26 +23,123 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+class OpDesc;
+class BlockDesc;
+// default infer var type context
+class InferVarTypeContext {
+ public:
+  InferVarTypeContext(const OpDesc* op, BlockDesc* block)
+      : op_(op), block_(block) {}
+
+  virtual ~InferVarTypeContext() {}
+
+  virtual Attribute GetAttr(const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(op_);
+    return op_->GetAttr(name);
+  }
+
+  virtual bool HasVar(const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    return block_->FindVarRecursive(name) != nullptr;
+  }
+
+  virtual bool HasInput(const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(op_);
+    return op_->Inputs().count(name) > 0;
+  }
+
+  virtual bool HasOutput(const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(op_);
+    return op_->Outputs().count(name) > 0;
+  }
+
+  virtual const std::vector<std::string>& Input(const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(op_);
+    return op_->Input(name);
+  }
+
+  virtual const std::vector<std::string>& Output(
+      const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(op_);
+    return op_->Output(name);
+  }
+
+  virtual proto::VarType::Type GetType(const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    return block_->FindRecursiveOrCreateVar(name).GetType();
+  }
+
+  virtual void SetType(const std::string& name, proto::VarType::Type type) {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    block_->FindRecursiveOrCreateVar(name).SetType(type);
+  }
+
+  virtual proto::VarType::Type GetDataType(const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    return block_->FindRecursiveOrCreateVar(name).GetDataType();
+  }
+
+  virtual void SetDataType(const std::string& name, proto::VarType::Type type) {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    block_->FindRecursiveOrCreateVar(name).SetDataType(type);
+  }
+
+  virtual std::vector<proto::VarType::Type> GetDataTypes(
+      const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    return block_->FindRecursiveOrCreateVar(name).GetDataTypes();
+  }
+
+  virtual void SetDataTypes(
+      const std::string& name,
+      const std::vector<proto::VarType::Type>& multiple_data_type) {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    block_->FindRecursiveOrCreateVar(name).SetDataTypes(multiple_data_type);
+  }
+
+  virtual std::vector<int64_t> GetShape(const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    return block_->FindRecursiveOrCreateVar(name).GetShape();
+  }
+
+  virtual void SetShape(const std::string& name,
+                        const std::vector<int64_t>& dims) {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    block_->FindRecursiveOrCreateVar(name).SetShape(dims);
+  }
+
+  virtual int32_t GetLoDLevel(const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    return block_->FindRecursiveOrCreateVar(name).GetLoDLevel();
+  }
+
+  virtual void SetLoDLevel(const std::string& name, int32_t lod_level) {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    block_->FindRecursiveOrCreateVar(name).SetLoDLevel(lod_level);
+  }
+
+ protected:
+  const OpDesc* op_;
+  BlockDesc* block_;
+};
+
 class VarTypeInference {
  public:
   virtual ~VarTypeInference() {}
-  virtual void operator()(const OpDesc& op_desc, BlockDesc* block) const = 0;
+  virtual void operator()(InferVarTypeContext* context) const = 0;  // NOLINT
 };
 
 class PassInDtypeAndVarTypeToOutput : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const final {
+  void operator()(framework::InferVarTypeContext* ctx) const final {  // NOLINT
     auto in_out_var_names = this->GetInputOutputWithSameType();
 
     for (auto& i_o_n : in_out_var_names) {
-      auto& x_name = op_desc.Input(i_o_n.first).at(0);
-      auto& out_name = op_desc.Output(i_o_n.second).at(0);
+      auto& x_name = ctx->Input(i_o_n.first).at(0);
+      auto& out_name = ctx->Output(i_o_n.second).at(0);
 
-      auto& x = block->FindRecursiveOrCreateVar(x_name);
-      auto& out = block->FindRecursiveOrCreateVar(out_name);
-      out.SetType(x.GetType());
-      out.SetDataType(x.GetDataType());
+      ctx->SetType(out_name, ctx->GetType(x_name));
+      ctx->SetDataType(out_name, ctx->GetDataType(x_name));
     }
   }
diff --git a/paddle/fluid/framework/var_type_inference_test.cc b/paddle/fluid/framework/var_type_inference_test.cc
index 2a75394fca719196a9d53894b080598e942baa45..6bbb25a573d076d5ec6d6fd960a304639e9e3d49 100644
--- a/paddle/fluid/framework/var_type_inference_test.cc
+++ b/paddle/fluid/framework/var_type_inference_test.cc
@@ -44,20 +44,20 @@ class SumOpMaker : public OpProtoAndCheckerMaker {
 
 class SumOpVarTypeInference : public VarTypeInference {
  public:
-  void operator()(const OpDesc &op_desc, BlockDesc *block) const override {
-    auto &inputs = op_desc.Input("X");
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    auto &inputs = ctx->Input("X");
     auto default_var_type = proto::VarType::SELECTED_ROWS;
 
     bool any_input_is_lod_tensor = std::any_of(
-        inputs.begin(), inputs.end(), [block](const std::string &name) {
-          return block->Var(name)->GetType() == proto::VarType::LOD_TENSOR;
+        inputs.begin(), inputs.end(), [&ctx](const std::string &name) {
+          return ctx->GetType(name) == proto::VarType::LOD_TENSOR;
         });
     if (any_input_is_lod_tensor) {
       default_var_type = proto::VarType::LOD_TENSOR;
     }
 
-    auto out_var_name = op_desc.Output("Out").front();
-    block->Var(out_var_name)->SetType(default_var_type);
+    auto out_var_name = ctx->Output("Out").front();
+    ctx->SetType(out_var_name, default_var_type);
   }
 };
 }  // namespace framework
diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt
index ec8dedd605235a2d197e6a313bd589d5b9520cdf..0d116a6495477ca69c10c130e63247a4f6c03b23 100644
--- a/paddle/fluid/imperative/CMakeLists.txt
+++ b/paddle/fluid/imperative/CMakeLists.txt
@@ -2,4 +2,5 @@ if(WITH_PYTHON)
 cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind)
 cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind)
 cc_library(engine SRCS engine.cc)
+cc_library(imperative_profiler SRCS profiler.cc)
 endif()
diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc
index 012dfc1c7f66027bc5375794e0d70ed78e70e781..036d2a50a4a7ea3ce7e052a56202b1d54465b03e 100644
--- a/paddle/fluid/imperative/layer.cc
+++ b/paddle/fluid/imperative/layer.cc
@@ -159,10 +159,9 @@ class Autograd {
       for (auto it : candidate->pre_ops_) {
         for (OpBase* pre_op : it.second) {
           if (!pre_op) continue;
-          VLOG(5) << "op dep " << candidate->op_desc_->Type() << " trace id "
+          VLOG(5) << "op dep " << candidate->Type() << " trace id "
                   << candidate->trace_id_ << " <---- " << it.first << " <---- "
-                  << pre_op->op_desc_->Type() << " trace id "
-                  << pre_op->trace_id_;
+                  << pre_op->Type() << " trace id " << pre_op->trace_id_;
           if (visited.find(pre_op) == visited.end()) {
             visited.insert(pre_op);
             queue.push_back(pre_op);
@@ -180,10 +179,12 @@ std::unique_ptr<VarBase> VarBase::NewVarBase(const platform::Place& dst_place,
   PADDLE_ENFORCE(var_->IsInitialized(),
                  "Variable must be initialized when getting numpy tensor");
 
-  std::unique_ptr<VarBase> new_var(new VarBase());
+  // TODO(minqiyang): change this after move unique_name generator to CXX
+  const framework::LoDTensor& self_tensor = var_->Get<framework::LoDTensor>();
+  std::unique_ptr<VarBase> new_var(new VarBase(
+      "Itmp", self_tensor.type(), self_tensor.dims(), dst_place, true, false));
   framework::LoDTensor* tensor =
       new_var->var_->GetMutable<framework::LoDTensor>();
-  tensor->Resize(var_->Get<framework::LoDTensor>().dims());
   tensor->set_lod(var_->Get<framework::LoDTensor>().lod());
 
   if (blocking) {
@@ -199,59 +200,103 @@ std::unique_ptr<VarBase> VarBase::NewVarBase(const platform::Place& dst_place,
   }
 
   if (platform::is_gpu_place(dst_place)) {
-    VLOG(3) << "copy tensor " << var_desc_->Name() << " from gpu";
+    VLOG(3) << "copy tensor " << Name() << " from gpu";
   }
 
   return new_var;
 }
 
 framework::LoDTensor& VarBase::GradValue() {
-  VLOG(3) << "get var grad " << var_desc_->Name();
+  VLOG(3) << "get var grad " << Name();
+  PADDLE_ENFORCE_NOT_NULL(grads_,
+                          "Could not get grad value from no grad variable");
   return *(grads_->var_->GetMutable<framework::LoDTensor>());
 }
 
 std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
-  if (grad_op_descs_.empty() && backward_id_ <= 0) {
-    VLOG(3) << "op with no grad: " << op_desc_->Type();
-    return {};
-  }
+  PADDLE_ENFORCE(!grad_op_descs_.empty() || backward_id_ > 0,
+                 "%s has no backward implementation", Type());
 
-  VLOG(3) << "apply op grad: " << op_desc_->Type();
-  std::vector<framework::VariableValueMap> grad_outputs;
+  VLOG(3) << "apply op grad: " << Type();
+  std::vector<VarBasePtrMap> tmp_grad_outputs;
   if (backward_id_ > 0) {
     VLOG(3) << "py_layer_grad";
-    grad_outputs.resize(1);
-    grad_outputs[0][framework::GradVarName(PyLayer::kFwdOut)] =
+    tmp_grad_outputs.resize(1);
+    tmp_grad_outputs[0][framework::GradVarName(PyLayer::kFwdOut)] =
        PyLayer::ApplyGrad(
            backward_id_,
            grad_input_vars_[0][framework::GradVarName(PyLayer::kFwdInp)]);
  } else {
-    grad_outputs.resize(grad_op_descs_.size());
-    for (size_t k = 0; k < grad_op_descs_.size(); ++k) {
+    const size_t grad_op_count = grad_op_descs_.size();
+
+    tmp_grad_outputs.resize(grad_op_count);
+    for (size_t k = 0; k < grad_op_count; ++k) {
       framework::OpDesc* grad_op_desc = grad_op_descs_[k];
-      VLOG(3) << "op grad " << grad_op_desc->Type();
-      for (auto it : grad_output_vars_[k]) {
-        auto& outputs = grad_outputs[k][it.first];
+      auto& grad_output_variable_map = grad_output_vars_[k];
+
+      VLOG(3) << "apply grad op " << grad_op_desc->Type();
+
+      // Allocate tmp grad output variable
+      for (const auto& it : grad_output_variable_map) {
+        auto& outputs = tmp_grad_outputs[k][it.first];
+        outputs.reserve(it.second.size());
         for (size_t i = 0; i < it.second.size(); ++i) {
+          VarBase* origin_grad_var_base = it.second[i];
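+          // The temporary grad VarBase mirrors the original grad var's name
+          // (suffixed with @IGrad), dtype, dims and place, so that AddTo can
+          // accumulate it into the persistent gradient afterwards.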
+
           // Allocate a new variable
-          Variable* tmp_var = new framework::Variable();
-          tmp_var->GetMutable<framework::LoDTensor>();
-          outputs.push_back(tmp_var);
+          VarBase* tmp_grad_var_base = new VarBase(
+              string::Sprintf("%s@IGrad", origin_grad_var_base->Name()),
+              origin_grad_var_base->DataType(), origin_grad_var_base->Dims(),
+              place_, true, false);
+          outputs.emplace_back(tmp_grad_var_base);
         }
       }
 
-      framework::RuntimeContext ctx(grad_input_vars_[k], grad_outputs[k]);
-
       // No need to do compile time infer shape here.
       // grad_op_desc_->InferShape(*block_);
-      grad_op_desc->InferVarType(block_);
+      // grad_op_desc->InferVarType(block_);
 
       std::unique_ptr<framework::OperatorBase> opbase =
           framework::OpRegistry::CreateOp(*grad_op_desc);
+
+      auto& info = framework::OpInfoMap::Instance().Get(grad_op_desc->Type());
+      if (info.infer_var_type_) {
+        RuntimeInferVarTypeContext infer_var_type_ctx(
+            &grad_input_vars_[k], &tmp_grad_outputs[k], &attrs_);
+        info.infer_var_type_(&infer_var_type_ctx);
+      }
+
       framework::OperatorWithKernel* op_kernel =
          dynamic_cast<framework::OperatorWithKernel*>(opbase.get());
       PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");
 
+      // Run grad op
+      framework::VariableValueMap grad_invars_map;
+      framework::VariableValueMap grad_outvars_map;
+
+      for (const auto& it : grad_input_vars_[k]) {
+        auto& grad_invars = grad_invars_map[it.first];
+        grad_invars.reserve(it.second.size());
+        for (const VarBase* grad_inp : it.second) {
+          PADDLE_ENFORCE_NOT_NULL(grad_inp->var_, "op %s input %s nullptr",
+                                  grad_op_desc->Type(), grad_inp->Name());
+
+          grad_invars.emplace_back(grad_inp->var_);
+        }
+      }
+
+      for (const auto& it : tmp_grad_outputs[k]) {
+        auto& grad_outvars = grad_outvars_map[it.first];
+        grad_outvars.reserve(it.second.size());
+        for (VarBase* grad_out : it.second) {
+          PADDLE_ENFORCE_NOT_NULL(grad_out->var_, "op %s output %s nullptr",
+                                  grad_op_desc->Type(), grad_out->Name());
+
+          grad_outvars.emplace_back(grad_out->var_);
+        }
+      }
+
+      framework::RuntimeContext ctx(grad_invars_map, grad_outvars_map);
       framework::Scope scope;
       PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place_);
       p.op.RuntimeInferShape(scope, place_, ctx);
@@ -260,15 +305,19 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
     }
   }
 
+  // Add tmp grad outputs to original grad vars
   for (size_t k = 0; k < grad_output_vars_.size(); ++k) {
-    for (auto it : grad_output_vars_[k]) {
-      auto& outputs = grad_outputs[k][it.first];
-      auto& origin_outputs = it.second;
+    for (const auto& it : grad_output_vars_[k]) {
+      auto& outputs = tmp_grad_outputs[k][it.first];
+      const auto& origin_outputs = it.second;
       PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size());
 
       for (size_t i = 0; i < outputs.size(); ++i) {
-        framework::Variable* grad = outputs[i];
-        framework::Variable* orig_grad = origin_outputs[i];
+        framework::Variable* grad = outputs[i]->var_;
+        framework::Variable* orig_grad = origin_outputs[i]->var_;
+        VLOG(3) << "AddTo Called with orig_grad is: "
+                << origin_outputs[i]->name_ << " Grad to be added is "
+                << outputs[i]->name_;
         AddTo(grad, orig_grad, place_);
         delete grad;
       }
@@ -316,33 +365,35 @@ void PyLayer::RegisterFunc(int func_id, const py::object& py_func) {
 
 int PyLayer::NumFuncs() { return py_funcs_.size(); }
 
-std::vector<VarBase*> PyLayer::Apply(int func_id,
-                                     const std::vector<VarBase*>& inputs) {
-  std::vector<framework::Variable*> invars;
-  for (const VarBase* in : inputs) {
-    invars.push_back(in->var_);
-  }
+std::vector<framework::Variable*> PyLayer::Apply(
+    int func_id, const std::vector<VarBase*>& inputs) {
   PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end());
-  std::vector<Variable*> outvars = CallPythonFunc(py_funcs_[func_id], invars);
-  std::vector<VarBase*> ret;
-  for (Variable* v : outvars) {
-    ret.push_back(new VarBase(v, new VarBase(true)));
-  }
-  return ret;
+  return CallPythonFunc(py_funcs_[func_id], inputs);
 }
 
-std::vector<Variable*> PyLayer::ApplyGrad(
-    int func_id, const std::vector<Variable*>& inputs) {
+std::vector<VarBase*> PyLayer::ApplyGrad(int func_id,
+                                         const std::vector<VarBase*>& inputs) {
   PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end());
-  return CallPythonFunc(py_funcs_[func_id], inputs);
+  auto rets = CallPythonFunc(py_funcs_[func_id], inputs);
+
+  std::vector<VarBase*> outs;
+  outs.reserve(rets.size());
+  for (size_t i = 0U; i != rets.size(); ++i) {
+    outs.emplace_back(new VarBase(
+        string::Sprintf("%s_out_%d", framework::GradVarName(PyLayer::kFwdOut),
+                        i),
+        rets[i], nullptr, true));
+  }
+
+  return outs;
 }
 
 std::vector<framework::Variable*> PyLayer::CallPythonFunc(
-    const py::object& callable, const std::vector<framework::Variable*>& ins) {
+    const py::object& callable, const std::vector<VarBase*>& ins) {
   py::gil_scoped_acquire guard;
   py::tuple in_args(ins.size());
   for (size_t i = 0; i < ins.size(); ++i) {
-    const framework::LoDTensor& t = ins[i]->Get<framework::LoDTensor>();
+    const framework::LoDTensor& t = ins[i]->var_->Get<framework::LoDTensor>();
     in_args[i] = t.IsInitialized() ? py::cast(t) : py::cast(nullptr);
   }
   VLOG(3) << "pyfunc in " << py::len(in_args);
@@ -352,6 +403,7 @@ std::vector<framework::Variable*> PyLayer::CallPythonFunc(
   auto ret_tuple = py::cast<py::tuple>(ret);
   size_t ret_num = py::len(ret_tuple);
   std::vector<Variable*> outs;
+  outs.reserve(ret_num);
   VLOG(3) << "pyfunc out " << ret_num;
   for (size_t i = 0; i < ret_num; ++i) {
     try {
@@ -362,7 +414,7 @@ std::vector<framework::Variable*> PyLayer::CallPythonFunc(
       auto* tensor = var->GetMutable<framework::LoDTensor>();
       tensor->ShareDataWith(*py_out_tensor);
       tensor->set_lod(py_out_tensor->lod());
-      outs.push_back(var);
+      outs.emplace_back(var);
     } catch (py::cast_error&) {
       PADDLE_THROW("The %d-th output must be LoDTensor", i);
     }
diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h
index 7a9f33dc1e6cbc0c3ec1e649906fb0a8de047189..72c548d5e92dec3ec2638904f508c2777ee327c6 100644
--- a/paddle/fluid/imperative/layer.h
+++ b/paddle/fluid/imperative/layer.h
@@ -18,14 +18,16 @@
 #include "paddle/fluid/framework/python_headers.h"
 // clang-format on
 
-#include <map>     // NOLINT
-#include <string>  // NOLINT
-#include <vector>  // NOLINT
-#include <memory>  // NOLINT
+#include <map>            // NOLINT
+#include <memory>         // NOLINT
+#include <string>         // NOLINT
+#include <unordered_map>  // NOLINT
+#include <vector>         // NOLINT
 
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/var_desc.h"
+#include "paddle/fluid/framework/var_type_inference.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/operators/math/math_function.h"
@@ -112,31 +114,55 @@ class OpBase;
  */
 class VarBase {
  public:
-  VarBase() : VarBase(new framework::Variable(), new VarBase(true)) {}
-
-  explicit VarBase(bool stop_gradient)
-      : VarBase(new framework::Variable(),
-                stop_gradient ? nullptr : new VarBase(true), stop_gradient) {}
-
-  VarBase(framework::Variable* var, VarBase* grad)
-      : VarBase(var, grad, false) {}
+  // Internal interface, create VarBase from exist variable
+  VarBase(const std::string& name, framework::Variable* var, VarBase* grad,
+          bool stop_gradient)
+      : VarBase(name, var->Get<framework::LoDTensor>().type(),
+                var->Get<framework::LoDTensor>().dims(),
+                var->Get<framework::LoDTensor>().place(), var, grad,
+                stop_gradient, false) {}
+
+  // Python interface
+  VarBase(const std::string& name, const framework::proto::VarType::Type dtype,
+          const std::vector<int64_t>& shape, const platform::Place& place,
+          bool stop_gradient, bool persistable)
+      : VarBase(name, dtype, framework::make_ddim(shape), place, stop_gradient,
+                persistable) {}
+
+  // Internal interface, create VarBase with ddim
+  VarBase(const std::string& name, const framework::proto::VarType::Type dtype,
+          const framework::DDim& shape, const platform::Place& place,
+          bool stop_gradient, bool persistable)
+      : VarBase(name, dtype, shape, place, nullptr, nullptr, stop_gradient,
+                persistable) {}
 
 private:
-  VarBase(framework::Variable* var, VarBase* grad, bool stop_gradient)
-      : name_(),
-        var_desc_(nullptr),
+  // TODO(minqiyang): need support SelectedRows
+  VarBase(const std::string& name, framework::proto::VarType::Type dtype,
+          const framework::DDim& shape, const platform::Place& place,
+          framework::Variable* var, VarBase* grad, bool stop_gradient,
+          bool persistable)
+      : name_(name),
+        type_(framework::proto::VarType::LOD_TENSOR),
         var_(var),
         grads_(grad),
-        block_(nullptr),
-        persistable_(false),
         stop_gradient_(stop_gradient),
+        persistable_(persistable),
         pre_op_(nullptr),
         pre_op_out_name_(),
-        pre_op_out_idx_(-1) {}
+        pre_op_out_idx_(-1) {
+    if (!var_) {
+      var_ = new framework::Variable();
+    }
+    auto tensor = var_->GetMutable<framework::LoDTensor>();
+    tensor->Resize(shape);
+    tensor->mutable_data(place, dtype);
+    VLOG(10) << "create varbase: " << name_ << " type: " << dtype
+             << " place: " << place;
+  }
 
 public:
   virtual ~VarBase() {
-    // TODO(minqiyang): remove var desc from block desc
     if (var_) {
       delete var_;
       var_ = nullptr;
@@ -151,14 +177,46 @@ class VarBase {
     pre_op_out_idx_ = -1;
   }
 
-  inline OpBase* PreOp() const { return pre_op_; }
-  inline int PreOpOutIdx() const { return pre_op_out_idx_; }
+  inline void SetName(const std::string& name) { name_ = name; }
+  inline std::string Name() const { return name_; }
+
+  inline std::vector<int64_t> Shape() const {
+    if (var_->IsInitialized()) {
+      return framework::vectorize(var_->Get<framework::LoDTensor>().dims());
+    } else {
+      return {};
+    }
+  }
+
+  inline framework::DDim Dims() const {
+    return var_->Get<framework::LoDTensor>().dims();
+  }
+
+  // data type, e.g. FP32
+  inline void SetDataType(framework::proto::VarType::Type type) {
+    auto tensor = var_->GetMutable<framework::LoDTensor>();
+    tensor->mutable_data(tensor->place(), type);
+  }
+  inline framework::proto::VarType::Type DataType() const {
+    auto tensor = var_->Get<framework::LoDTensor>();
+    return tensor.type();
+  }
+
+  // tensor type, e.g. LoDTensor
LoDTensor + inline void SetType(framework::proto::VarType::Type type) { type_ = type; } + inline framework::proto::VarType::Type Type() const { return type_; } inline void SetStopGradient(bool stop_gradient) { stop_gradient_ = stop_gradient; } inline bool IsStopGradient() const { return stop_gradient_; } + inline void SetPersistable(bool persistable) { persistable_ = persistable; } + inline bool IsPersistable() const { return persistable_; } + + inline OpBase* PreOp() const { return pre_op_; } + inline int PreOpOutIdx() const { return pre_op_out_idx_; } + void RunBackward(); inline void ResetPreOp(OpBase* op) { @@ -180,7 +238,7 @@ class VarBase { } void ClearGradient() { - VLOG(1) << "clear gradient of " << var_desc_->Name(); + VLOG(1) << "clear gradient of " << Name(); if (grads_ && grads_->var_ && grads_->var_->IsInitialized()) { auto grads_t = grads_->var_->GetMutable(); operators::math::set_constant( @@ -196,23 +254,20 @@ class VarBase { const bool blocking) const; inline std::string GradName() const { - PADDLE_ENFORCE( - var_desc_, - "Couldn't get gradient variable's name, please call backward() first"); - return string::Sprintf("%s@IGrad", var_desc_->Name()); + return string::Sprintf("%s@IGrad", Name()); } std::string name_; - framework::VarDesc* var_desc_; + framework::proto::VarType::Type type_; + platform::Place place_; framework::Variable* var_; VarBase* grads_; - framework::BlockDesc* block_; - bool persistable_; - private: bool stop_gradient_; + bool persistable_; + OpBase* pre_op_; std::string pre_op_out_name_; int pre_op_out_idx_; @@ -223,11 +278,11 @@ class VarBase { */ class PYBIND11_HIDDEN OpBase { public: - OpBase() - : op_desc_(nullptr), + OpBase(const std::string& type) + : type_(type), + trace_id_(-1), forward_id_(-1), backward_id_(-1), - trace_id_(-1), place_(platform::CPUPlace()), backward_hooks_() {} @@ -249,13 +304,40 @@ class PYBIND11_HIDDEN OpBase { std::map> ApplyGrad(); + inline std::string Type() const { return type_; } + inline std::string GradOpType(size_t index) const { + PADDLE_ENFORCE_NOT_NULL(grad_op_descs_[index]); + return grad_op_descs_[index]->Type(); + } + void RegisterBackwardHooks(const py::object& callable); void InvokeBackwardHooks(); - // One of `op_desc_` or `forward_id_` is set, not both. - // For pure python PyLayer, use `forward_id_`, otherwise, use op_desc_. - framework::OpDesc* op_desc_; + void TrackPreOp(const std::string& inp_name, + const std::vector& inputs) { + auto& pre_ops_list = pre_ops_[inp_name]; + pre_ops_list.reserve(inputs.size()); + auto& pre_ops_out_idx_list = pre_ops_out_idx_[inp_name]; + for (VarBase* inp_var : inputs) { + if (inp_var->PreOp() && !inp_var->IsStopGradient()) { + VLOG(3) << "add pre op " << inp_var->PreOp()->Type() << " in slot " + << inp_name; + pre_ops_list.emplace_back(inp_var->PreOp()); + pre_ops_out_idx_list.push_back(inp_var->PreOpOutIdx()); + } else { + VLOG(3) << "no pre op in slot " << inp_name + << " input var stop_gradient: " << inp_var->IsStopGradient(); + pre_ops_list.emplace_back(nullptr); + // pre_ops_out_idx_list.push_back(-1); + } + } + } + + std::string type_; + // One of `trace_id_` or `forward_id_` is set, not both. + // For pure python PyLayer, use `forward_id_`, otherwise, use trace_id_. + int trace_id_; int forward_id_; // When has backward, one of `grad_op_descs_` or `backward_id_` is set, @@ -263,7 +345,6 @@ class PYBIND11_HIDDEN OpBase { // Note: each fwd op corresponds to a vector of bwd ops. 
std::vector grad_op_descs_; int backward_id_; - int trace_id_; platform::Place place_; @@ -273,13 +354,13 @@ class PYBIND11_HIDDEN OpBase { std::map> pre_ops_out_idx_; // Inputs to a vector of bwd ops. - std::vector grad_input_vars_; + std::vector grad_input_vars_; // Outputs to a vector of bwd ops. - std::vector grad_output_vars_; - - framework::BlockDesc* block_; + std::vector grad_output_vars_; std::vector backward_hooks_; + + framework::AttributeMap attrs_; }; class Layer { @@ -303,15 +384,134 @@ class PyLayer { static int NumFuncs(); - static std::vector Apply(int func_id, - const std::vector& inputs); + static std::vector Apply( + int func_id, const std::vector& inputs); - static std::vector ApplyGrad( - int func_id, const std::vector& inputs); + static std::vector ApplyGrad(int func_id, + const std::vector& inputs); private: static std::vector CallPythonFunc( - const py::object& callable, const std::vector& ins); + const py::object& callable, const std::vector& ins); +}; + +// infer var type context for imperative mode +class PYBIND11_HIDDEN RuntimeInferVarTypeContext + : public framework::InferVarTypeContext { + public: + RuntimeInferVarTypeContext(const imperative::VarBasePtrMap* inputs, + imperative::VarBasePtrMap* outputs, + const framework::AttributeMap* attrs_map) + : InferVarTypeContext(nullptr, nullptr), + inputs_(inputs), + outputs_(outputs), + attrs_(attrs_map), + input_names_(), + output_names_(), + var_set_() { + input_names_.reserve(inputs_->size()); + for (auto& it : *inputs_) { + for (imperative::VarBase* var : it.second) { + input_names_[it.first].emplace_back(var->Name()); + var_set_[var->Name()] = var; + } + } + + output_names_.reserve(outputs_->size()); + for (auto& it : *outputs_) { + for (imperative::VarBase* var : it.second) { + output_names_[it.first].emplace_back(var->Name()); + var_set_[var->Name()] = var; + } + } + } + + virtual ~RuntimeInferVarTypeContext() {} + + framework::Attribute GetAttr(const std::string& name) const override { + PADDLE_ENFORCE_NOT_NULL(attrs_); + return attrs_->at(name); + } + + bool HasVar(const std::string& name) const override { + return var_set_.count(name) > 0; + } + + bool HasInput(const std::string& name) const override { + PADDLE_ENFORCE_NOT_NULL(inputs_); + return inputs_->count(name) > 0; + } + + bool HasOutput(const std::string& name) const override { + PADDLE_ENFORCE_NOT_NULL(outputs_); + return outputs_->count(name) > 0; + } + + const std::vector& Input( + const std::string& name) const override { + return input_names_.at(name); + } + + const std::vector& Output( + const std::string& name) const override { + return output_names_.at(name); + } + + framework::proto::VarType::Type GetType( + const std::string& name) const override { + return var_set_.at(name)->Type(); + } + + void SetType(const std::string& name, + framework::proto::VarType::Type type) override { + var_set_[name]->SetType(type); + } + + framework::proto::VarType::Type GetDataType( + const std::string& name) const override { + return var_set_.at(name)->DataType(); + } + + void SetDataType(const std::string& name, + framework::proto::VarType::Type type) override { + var_set_[name]->SetDataType(type); + } + + std::vector GetDataTypes( + const std::string& name) const override { + PADDLE_THROW("GetDataTypes is not supported in runtime InferVarType"); + } + + void SetDataTypes(const std::string& name, + const std::vector& + multiple_data_type) override { + PADDLE_THROW("SetDataTypes is not supported in runtime InferVarType"); + } + + std::vector GetShape(const 
std::string& name) const override { + PADDLE_THROW("Do not handle Shape in runtime InferVarType"); + } + + void SetShape(const std::string& name, + const std::vector& dims) override { + PADDLE_THROW("Do not handle Shape in runtime InferVarType"); + } + + int32_t GetLoDLevel(const std::string& name) const override { + PADDLE_THROW("Do not handle LoDLevel in runtime InferVarType"); + } + + void SetLoDLevel(const std::string& name, int32_t lod_level) override { + PADDLE_THROW("Do not handle LoDLevel in runtime InferVarType"); + } + + private: + const imperative::VarBasePtrMap* inputs_; + imperative::VarBasePtrMap* outputs_; + const framework::AttributeMap* attrs_; + std::unordered_map> input_names_; + std::unordered_map> output_names_; + std::unordered_map var_set_; }; } // namespace imperative diff --git a/paddle/fluid/imperative/profiler.cc b/paddle/fluid/imperative/profiler.cc new file mode 100644 index 0000000000000000000000000000000000000000..34570b3a60ec83fdeb1577789271942125b16eb1 --- /dev/null +++ b/paddle/fluid/imperative/profiler.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/imperative/profiler.h" + +#ifdef WITH_GPERFTOOLS +#include "gperftools/profiler.h" +#endif +#include +#include +#include // NOLINT +#include // NOLINT + +DEFINE_string( + tracer_profile_fname, "xxgperf", + "Profiler filename for imperative tracer, which is generated by gperftools. " + "Only valid when compiled `WITH_PROFILER=ON`. Empty to disable."); + +namespace paddle { +namespace imperative { + +static std::once_flag gTracerProfileOnce; +#ifdef WITH_GPERFTOOLS +static bool gTracerProfilerStarted = false; +#endif + +void StartProfile() { + if (!FLAGS_tracer_profile_fname.empty()) { + std::call_once(gTracerProfileOnce, [] { +#ifdef WITH_GPERFTOOLS + ProfilerStart(FLAGS_tracer_profile_fname.c_str()); + gTracerProfilerStarted = true; +#else + LOG(WARNING) << "Paddle is not compiled with gperftools. " + "FLAGS_tracer_profile_fname will be ignored"; +#endif + }); + } +} + +void StopProfile() { +#ifdef WITH_GPERFTOOLS + ProfilerFlush(); +#else + LOG(WARNING) << "Paddle is not compiled with gperftools. " + "FLAGS_tracer_profile_fname will be ignored"; +#endif +} + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/profiler.h b/paddle/fluid/imperative/profiler.h new file mode 100644 index 0000000000000000000000000000000000000000..d52aeed4e81755cfa285616d7b0a7e79061c6af8 --- /dev/null +++ b/paddle/fluid/imperative/profiler.h @@ -0,0 +1,25 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace paddle { +namespace imperative { + +extern void StartProfile(); + +extern void StopProfile(); + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 0cb1676372fdd35a762e897d269550f2d1e1ac36..7c9d0af3ecd647604ab46ee6239fc352e5fd8d85 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -19,52 +19,44 @@ #include #include +#include "paddle/fluid/framework/var_type_inference.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" -#ifdef WITH_GPERFTOOLS -#include "gperftools/profiler.h" -#endif - -DEFINE_string( - tracer_profile_fname, "", - "Profiler filename for imperative tracer, which generated by gperftools." - "Only valid when compiled `WITH_PROFILER=ON`. Empty if disable."); - namespace paddle { namespace imperative { -static std::once_flag gTracerProfileOnce; -#ifdef WITH_GPERFTOOLS -static bool gTracerProfilerStarted = false; -#endif - void CreateGradOp(const framework::OpDesc& op_desc, const std::unordered_set& no_grad_set, const std::vector& grad_sub_block, std::vector* grad_op_descs, std::unordered_map* grad_to_var) { PADDLE_ENFORCE(grad_op_descs->empty()); - std::vector> descs = - framework::OpInfoMap::Instance() - .Get(op_desc.Type()) - .GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block); + const framework::OpInfo& op_info = + framework::OpInfoMap::Instance().Get(op_desc.Type()); + if (!op_info.grad_op_maker_) return; + std::vector> descs = + op_info.GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block); for (auto& desc : descs) { grad_op_descs->emplace_back(desc.release()); } } -void InitVar(framework::Variable* var, framework::Variable* grad_var, - platform::DeviceContext* dev_ctx) { +void InitGrad(VarBase* var, platform::DeviceContext* dev_ctx) { + PADDLE_ENFORCE_NOT_NULL(var, "Could not get valid var base"); PADDLE_ENFORCE_NOT_NULL(dev_ctx, "Could not get valid device from forward op"); - auto& var_t = var->Get(); - grad_var->GetMutable()->mutable_data( - var_t.dims(), dev_ctx->GetPlace()); - operators::math::set_constant( - *dev_ctx, grad_var->GetMutable(), 0.0); + + if (var->grads_ == nullptr) { + auto& var_t = var->var_->Get(); + var->grads_ = new VarBase(var->GradName(), framework::proto::VarType::FP32, + framework::vectorize(var_t.dims()), + dev_ctx->GetPlace(), true, false); + auto grad_t = var->grads_->var_->GetMutable(); + operators::math::set_constant(*dev_ctx, grad_t, 0.0); + } } platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs) { @@ -85,92 +77,135 @@ platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs) { return result; } -Tracer::Tracer(framework::BlockDesc* root_block) : root_block_(root_block) { - if (!FLAGS_tracer_profile_fname.empty()) { - std::call_once(gTracerProfileOnce, [] { -#ifdef WITH_GPERFTOOLS - ProfilerStart(FLAGS_tracer_profile_fname.c_str()); - gTracerProfilerStarted = true; -#else - LOG(WARNING) << "Paddle is not 
compiled with gperftools. " - "FLAGS_tracer_profile_fname will be ignored"; -#endif - }); +framework::VariableNameMap CreateInputVarNameMap( + const OpBase* op, const VarBasePtrMap& varbase_map) { + framework::VariableNameMap result; + + auto& info_map = framework::OpInfoMap::Instance(); + auto* op_info = info_map.GetNullable(op->Type()); + if (op_info == nullptr || op_info->proto_ == nullptr) { + return result; } -} -std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, - const VarBasePtrMap& outputs, - framework::BlockDesc* block, - const platform::Place expected_place, - const bool stop_gradient) { -#ifdef WITH_GPERFTOOLS - if (gTracerProfilerStarted) { - ProfilerFlush(); + for (auto& in : op_info->Proto().inputs()) { + auto it = varbase_map.find(in.name()); + if (it == varbase_map.end()) { + PADDLE_ENFORCE(in.dispensable()); + result[in.name()] = {}; + } else { + auto var_vector = it->second; + std::vector args; + args.reserve(var_vector.size()); + for (VarBase* var_base : var_vector) { + args.emplace_back(var_base->Name()); + } + result[in.name()] = args; + } } -#endif + return result; +} - std::map vars; +framework::VariableNameMap CreateOutputVarNameMap( + const OpBase* op, const VarBasePtrMap& varbase_map) { + framework::VariableNameMap result; - framework::OpDesc* op_desc = op->op_desc_; - VLOG(3) << "tracer tracing " << op_desc->Type() << " trace id " - << op->trace_id_; - op_desc->InferShape(*block); - op_desc->InferVarType(block); + auto& info_map = framework::OpInfoMap::Instance(); + auto* op_info = info_map.GetNullable(op->Type()); + if (op_info == nullptr || op_info->proto_ == nullptr) { + return result; + } - std::unique_ptr op_base = - framework::OpRegistry::CreateOp(*op_desc); + for (auto& out : op_info->Proto().outputs()) { + auto it = varbase_map.find(out.name()); + if (it == varbase_map.end()) { + PADDLE_ENFORCE(out.dispensable()); + result[out.name()] = {}; + } else { + auto var_vector = it->second; + std::vector args; + args.reserve(var_vector.size()); + for (VarBase* var_base : var_vector) { + args.emplace_back(var_base->Name()); + } + result[out.name()] = args; + } + } + return result; +} + +Tracer::Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {} +std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, + VarBasePtrMap* outputs, + framework::AttributeMap attrs_map, + const platform::Place expected_place, + const bool stop_gradient) { framework::VariableValueMap invars_map; framework::VariableValueMap outvars_map; + // Construct input_vars_map and output_vars_map + std::map current_vars_map; op->input_vars_ = inputs; for (auto it : op->input_vars_) { auto& invars = invars_map[it.first]; invars.reserve(it.second.size()); for (VarBase* inp : it.second) { - PADDLE_ENFORCE_NOT_NULL(inp->var_, "op %s input %s nullptr", - op->op_desc_->Type(), inp->var_desc_->Name()); + PADDLE_ENFORCE_NOT_NULL(inp->var_, "op %s input %s nullptr", op->Type(), + inp->Name()); invars.emplace_back(inp->var_); - vars[inp->var_desc_->Name()] = inp; - if (inp->PreOp() && !inp->IsStopGradient()) { - op->pre_ops_[it.first].push_back(inp->PreOp()); - op->pre_ops_out_idx_[it.first].push_back(inp->PreOpOutIdx()); - VLOG(3) << "add pre op " << inp->PreOp()->op_desc_->Type(); - } else { - op->pre_ops_[it.first].push_back(nullptr); + if (!stop_gradient) { + current_vars_map[inp->Name()] = inp; } - VLOG(3) << "input vname " << inp->var_desc_->Name() << " " - << inp->var_->IsInitialized() << " stop_gradient " - << inp->IsStopGradient(); + VLOG(3) << "input var name: 
" << inp->Name() + << " inited: " << inp->var_->IsInitialized() + << " stop_grad: " << inp->IsStopGradient(); } + op->TrackPreOp(it.first, it.second); } - op->output_vars_ = outputs; + op->output_vars_ = *outputs; for (auto it : op->output_vars_) { auto& outvars = outvars_map[it.first]; const std::vector& outputs = it.second; outvars.reserve(outputs.size()); - for (size_t i = 0; i < outputs.size(); ++i) { + for (size_t i = 0U; i < outputs.size(); ++i) { VarBase* out = outputs[i]; outvars.emplace_back(out->var_); - vars[out->var_desc_->Name()] = out; - - framework::VarDesc* var_desc = block->FindVar(out->var_desc_->Name()); - if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { - out->var_->GetMutable(); - } else { - LOG(ERROR) << "tracer doesn't support yet"; - } out->TrackPreOp(op, it.first, i, stop_gradient); + if (!stop_gradient) { + current_vars_map[out->Name()] = out; + } - VLOG(3) << "output vname " << out->var_desc_->Name() << " " - << out->var_->IsInitialized(); + VLOG(3) << "input var name: " << out->Name() + << " inited: " << out->var_->IsInitialized() + << " stop_grad: " << out->IsStopGradient(); } } - VLOG(3) << "tracer running " << op_desc->Type(); + // Check attrs and create op + framework::VariableNameMap invars_name_map = + CreateInputVarNameMap(op, inputs); + framework::VariableNameMap outvars_name_map = + CreateOutputVarNameMap(op, *outputs); + + auto& info = framework::OpInfoMap::Instance().Get(op->Type()); + if (info.Checker() != nullptr) { + info.Checker()->Check(&attrs_map); + } + + std::unique_ptr op_base = + framework::OpRegistry::CreateOp(op->Type(), invars_name_map, + outvars_name_map, attrs_map); + + if (info.infer_var_type_) { + RuntimeInferVarTypeContext infer_var_type_ctx(&inputs, outputs, &attrs_map); + info.infer_var_type_(&infer_var_type_ctx); + } + + // TODO(minqiyang): Support infer var type in imperative mode + // Run forward op + VLOG(3) << "tracer running " << op->Type(); framework::RuntimeContext ctx(invars_map, outvars_map); // TODO(panyx0718): Cache p. 
@@ -186,36 +221,45 @@ std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, framework::ExecutionContext(prepared_op.op, scope, *prepared_op.dev_ctx, prepared_op.ctx, prepared_op.kernel_configs)); + // construct backward op std::set vars_saved_for_backward; - if (!stop_gradient) { + VLOG(5) << "start construct backward op"; + + // construct grad op descs + op->attrs_ = attrs_map; + std::unique_ptr fwd_op_desc(new framework::OpDesc( + op->Type(), invars_name_map, outvars_name_map, attrs_map)); std::unique_ptr> grad_to_var( new std::unordered_map()); - CreateGradOp(*op_desc, {}, {block}, &op->grad_op_descs_, grad_to_var.get()); + // NOTE(minqiyang): We don't support control flow op in imperative now + // Add grad_block_ when we want to support it + CreateGradOp(*fwd_op_desc, {}, {}, &op->grad_op_descs_, grad_to_var.get()); + + VLOG(5) << "create grad op desc: " << op->grad_op_descs_[0]->Type(); - op->grad_input_vars_.resize(op->grad_op_descs_.size()); - op->grad_output_vars_.resize(op->grad_op_descs_.size()); + const size_t grad_op_count = op->grad_op_descs_.size(); - for (size_t i = 0; i < op->grad_op_descs_.size(); ++i) { + op->grad_input_vars_.resize(grad_op_count); + op->grad_output_vars_.resize(grad_op_count); + + for (size_t i = 0; i < grad_op_count; ++i) { framework::OpDesc* grad_op_desc = op->grad_op_descs_[i]; for (auto it : grad_op_desc->Inputs()) { auto& grad_in_vars = op->grad_input_vars_[i][it.first]; + grad_in_vars.reserve(it.second.size()); for (const std::string& grad_invar : it.second) { - block->FindRecursiveOrCreateVar(grad_invar); auto var_it = grad_to_var->find(grad_invar); if (var_it == grad_to_var->end()) { - auto fwd_var_it = vars.find(grad_invar); - PADDLE_ENFORCE(fwd_var_it != vars.end()); + auto fwd_var_it = current_vars_map.find(grad_invar); + PADDLE_ENFORCE(fwd_var_it != current_vars_map.end()); // Forward inputs or outputs. - grad_in_vars.push_back(fwd_var_it->second->var_); + grad_in_vars.emplace_back(fwd_var_it->second); } else { - VarBase* var = vars[var_it->second]; - if (!var->grads_->var_->IsInitialized()) { - InitVar(var->var_, var->grads_->var_, - prepared_op.GetDeviceContext()); - } + VarBase* var = current_vars_map[var_it->second]; + InitGrad(var, prepared_op.GetDeviceContext()); // Douts. 
- grad_in_vars.push_back(var->grads_->var_); + grad_in_vars.emplace_back(var->grads_); } vars_saved_for_backward.insert(it.first); @@ -225,48 +269,47 @@ std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, for (auto it : grad_op_desc->Outputs()) { auto& grad_out_vars = op->grad_output_vars_[i][it.first]; for (const std::string& grad_outvar : it.second) { - block->FindRecursiveOrCreateVar(grad_outvar); auto var_it = grad_to_var->find(grad_outvar); PADDLE_ENFORCE(var_it != grad_to_var->end(), "Could not found the grad op output var, should this " "operator %s's stop gradient be True", - op_desc->Type()); - VarBase* var = vars[var_it->second]; - if (!var->grads_->var_->IsInitialized()) { - InitVar(var->var_, var->grads_->var_, - prepared_op.GetDeviceContext()); - } - grad_out_vars.push_back(var->grads_->var_); + op->Type()); + VarBase* var = current_vars_map[var_it->second]; + InitGrad(var, prepared_op.GetDeviceContext()); + grad_out_vars.push_back(var->grads_); + VLOG(3) << "grads output var name: " << var->name_; } } } } - op->block_ = block; return vars_saved_for_backward; } std::vector Tracer::PyTrace(OpBase* op, const std::vector& inputs, bool stop_gradient) { - VLOG(3) << "py_trace"; + VLOG(3) << "py_trace " << op->Type(); + op->input_vars_[PyLayer::kFwdInp] = inputs; - op->output_vars_[PyLayer::kFwdOut] = PyLayer::Apply(op->forward_id_, inputs); - for (VarBase* inp : inputs) { - if (inp->PreOp() && !inp->IsStopGradient()) { - op->pre_ops_[PyLayer::kFwdInp].push_back(inp->PreOp()); - op->pre_ops_out_idx_[PyLayer::kFwdInp].push_back(inp->PreOpOutIdx()); - } else { - op->pre_ops_[PyLayer::kFwdInp].push_back(nullptr); - } - } - auto& outputs = op->output_vars_[PyLayer::kFwdOut]; - for (size_t i = 0; i < outputs.size(); ++i) { - VarBase* out = outputs[i]; + std::vector ret_vars = + PyLayer::Apply(op->forward_id_, inputs); + + op->TrackPreOp(PyLayer::kFwdInp, inputs); + + std::vector& outputs = op->output_vars_[PyLayer::kFwdOut]; + outputs.reserve(ret_vars.size()); + for (size_t i = 0U; i != ret_vars.size(); ++i) { + framework::Variable* v = ret_vars[i]; + VarBase* out = new VarBase(string::Sprintf("%s_out_%d", op->Type(), i), v, + nullptr, stop_gradient); + outputs.emplace_back(out); out->TrackPreOp(op, PyLayer::kFwdOut, i, stop_gradient); } + if (!stop_gradient) { + VLOG(5) << "start construct backward op"; op->grad_input_vars_.resize(1); op->grad_output_vars_.resize(1); auto& grad_input_vars = @@ -274,30 +317,23 @@ std::vector Tracer::PyTrace(OpBase* op, auto& grad_output_vars = op->grad_output_vars_[0][framework::GradVarName(PyLayer::kFwdOut)]; - for (const VarBase* inp : inputs) { - grad_input_vars.push_back(inp->var_); + for (VarBase* inp : inputs) { + grad_input_vars.push_back(inp); } for (VarBase* out : outputs) { - grad_input_vars.push_back(out->var_); + grad_input_vars.push_back(out); } + // TODO(minqiyang): Add GPU support for PyLayer, only support CPU now platform::CPUPlace place; for (VarBase* out : outputs) { - grad_input_vars.push_back(out->grads_->var_); - if (!grad_input_vars.back()->IsInitialized()) { - // TODO(minqiyang): Add GPU support for PyLayer, only support CPU now - InitVar(out->var_, grad_input_vars.back(), - platform::DeviceContextPool::Instance().Get(place)); - } + InitGrad(out, platform::DeviceContextPool::Instance().Get(place)); + grad_input_vars.push_back(out->grads_); } - for (const VarBase* inp : inputs) { - grad_output_vars.push_back(inp->grads_->var_); - if (!grad_output_vars.back()->IsInitialized()) { - // TODO(minqiyang): Add GPU support for 
PyLayer, only support CPU now - InitVar(inp->var_, grad_output_vars.back(), - platform::DeviceContextPool::Instance().Get(place)); - } + for (VarBase* inp : inputs) { + InitGrad(inp, platform::DeviceContextPool::Instance().Get(place)); + grad_output_vars.push_back(inp->grads_); } } return outputs; diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 8a0267c37f7c98a172fe0fa573955dc420952c0a..a87f3b8009dd552626c6c03fba3b0bbf3a78bb83 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -17,6 +17,8 @@ #include #include #include +#include +#include #include #include "paddle/fluid/framework/op_desc.h" @@ -34,7 +36,8 @@ void CreateGradOp(const framework::OpDesc& op_desc, framework::OpDesc** grad_op_desc, std::unordered_map* grad_to_var); -void InitVar(framework::Variable* var, framework::Variable* grad_var); +void InitVar(const VarBase* var, framework::Variable* grad_var, + platform::DeviceContext* dev_ctx); platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs); @@ -45,8 +48,8 @@ class Tracer { virtual ~Tracer() {} std::set Trace(OpBase* op, const VarBasePtrMap& inputs, - const VarBasePtrMap& outputs, - framework::BlockDesc* block, + VarBasePtrMap* outputs, // NOLINT + framework::AttributeMap attrs_map, const platform::Place expected_place, const bool stop_gradient = false); diff --git a/paddle/fluid/imperative/type_defs.h b/paddle/fluid/imperative/type_defs.h index fc9e42f8d0e9996176a5cbab7d8c7cf08ddce1af..c51ce931defbc87231a2f8c6c07f99d9853fb283 100644 --- a/paddle/fluid/imperative/type_defs.h +++ b/paddle/fluid/imperative/type_defs.h @@ -25,6 +25,7 @@ class VarBase; class OpBase; typedef std::map> VarBasePtrMap; +typedef std::map> ConstVarBasePtrMap; typedef std::map> OpBasePtrMap; } // namespace imperative diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 762640d6d1ce12dff511fc7149e872efa834036c..fb433ff2a2bd113358152248120d0d2be94bd927 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -16,7 +16,10 @@ add_subdirectory(utils) if (TENSORRT_FOUND) add_subdirectory(tensorrt) endif() -# add_subdirectory(anakin) + +if (ANAKIN_FOUND) + add_subdirectory(anakin) +endif() get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES) @@ -34,18 +37,29 @@ endif(WIN32) add_subdirectory(api) +if(WITH_MKLDNN) + set(mkldnn_quantizer_src ${CMAKE_CURRENT_SOURCE_DIR}/api/mkldnn_quantizer.cc) + set(mkldnn_quantizer_cfg mkldnn_quantizer_config) +endif() + set(STATIC_INFERENCE_APIS paddle_fluid_api paddle_inference_api analysis_predictor) set(SHARED_INFERENCE_SRCS io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc + ${mkldnn_quantizer_src} ${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc) +# FIXME(gongwb): hidden libdgc.a +if(WITH_GPU AND NOT WIN32) + set(fluid_modules ${fluid_modules} dgc) +endif() + if(WIN32) sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array - analysis_config paddle_pass_builder) + analysis_config ${mkldnn_quantizer_cfg} paddle_pass_builder) else(WIN32) cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} - zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder) + zero_copy_tensor reset_tensor_array analysis_config ${mkldnn_quantizer_cfg} paddle_pass_builder) 
endif(WIN32) if(NOT APPLE) @@ -58,11 +72,11 @@ endif() if(WIN32) sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array - analysis_config paddle_pass_builder) + analysis_config ${mkldnn_quantizer_cfg} paddle_pass_builder) else(WIN32) cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array - analysis_config paddle_pass_builder) + analysis_config ${mkldnn_quantizer_cfg} paddle_pass_builder) endif() get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) target_link_libraries(paddle_fluid_shared ${os_dependency_modules}) @@ -91,5 +105,5 @@ if(WITH_TESTING) add_subdirectory(tests/book) if(WITH_INFERENCE_API_TEST) add_subdirectory(tests/api) - endif() + endif() endif() diff --git a/paddle/fluid/inference/anakin/CMakeLists.txt b/paddle/fluid/inference/anakin/CMakeLists.txt index b418af62f8cae4513bcca24f057d1fe100bbea25..e8fb56590563f49f920bfe71d160ec822cb3ca30 100644 --- a/paddle/fluid/inference/anakin/CMakeLists.txt +++ b/paddle/fluid/inference/anakin/CMakeLists.txt @@ -1,4 +1,5 @@ -cc_library(anakin_engine SRCS engine.cc) +cc_library(anakin_engine SRCS engine.cc DEPS framework_proto) +cc_library(anakin_op_teller SRCS op_teller.cc DEPS framework_proto) target_link_libraries(anakin_engine anakin anakin_saber_common) cc_test(test_anakin_engine SRCS test_anakin_engine.cc DEPS anakin_engine) add_subdirectory(convert) diff --git a/paddle/fluid/inference/anakin/convert/CMakeLists.txt b/paddle/fluid/inference/anakin/convert/CMakeLists.txt index f5bfee861f14877b5a67bc48aeb14b8213a27370..1e7f5ac799de0d7a1debec0529d262f021bba790 100644 --- a/paddle/fluid/inference/anakin/convert/CMakeLists.txt +++ b/paddle/fluid/inference/anakin/convert/CMakeLists.txt @@ -1,2 +1,19 @@ -cc_library(anakin_op_converter SRCS fc.cc registrar.cc DEPS anakin_engine framework_proto scope) -cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op) +cc_library(anakin_op_converter SRCS fc.cc conv2d.cc conv2d_fusion.cc + elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc softmax.cc batch_norm.cc reshape.cc flatten.cc transpose.cc density_prior_box.cc detection_out.cc scale.cc dropout.cc im2sequence.cc sum.cc DEPS anakin_engine framework_proto scope op_registry) + +cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op SERIAL) +cc_test(test_anakin_conv2d SRCS test_conv2d_op.cc DEPS anakin_op_converter conv_op im2col vol2col depthwise_conv SERIAL) +cc_test(test_anakin_activation SRCS test_activation_op.cc DEPS activation_op anakin_op_converter SERIAL) +cc_test(test_anakin_pool2d SRCS test_pool2d_op.cc DEPS anakin_op_converter pool_op pooling SERIAL) +cc_test(test_anakin_concat SRCS test_concat_op.cc DEPS anakin_op_converter concat_op concat_and_split SERIAL) +cc_test(test_anakin_split SRCS test_split_op.cc DEPS anakin_op_converter split_op concat_and_split SERIAL) +cc_test(test_anakin_elementwise SRCS test_elementwise_op.cc DEPS anakin_op_converter elementwise_add_op elementwise_mul_op SERIAL) +cc_test(test_anakin_relu SRCS test_relu_op.cc DEPS activation_op anakin_op_converter SERIAL) +cc_test(test_anakin_softmax SRCS test_softmax_op.cc DEPS anakin_op_converter softmax_op softmax SERIAL) +cc_test(test_anakin_reshape SRCS test_reshape_op.cc DEPS anakin_op_converter reshape_op SERIAL) +cc_test(test_anakin_flatten SRCS test_flatten_op.cc DEPS anakin_op_converter flatten_op reshape_op SERIAL) +cc_test(test_anakin_transpose 
SRCS test_transpose_op.cc DEPS anakin_op_converter transpose_op SERIAL) +cc_test(test_anakin_batch_norm SRCS test_batch_norm_op.cc DEPS anakin_op_converter batch_norm_op SERIAL) +cc_test(test_anakin_dropout SRCS test_dropout_op.cc DEPS anakin_op_converter dropout_op SERIAL) +#cc_test(test_anakin_im2sequence SRCS test_im2sequence_op.cc DEPS anakin_op_converter im2sequence_op im2col) +cc_test(test_anakin_sum SRCS test_sum_op.cc DEPS anakin_op_converter sum_op selected_rows_functor SERIAL) diff --git a/paddle/fluid/inference/anakin/convert/activation.cc b/paddle/fluid/inference/anakin/convert/activation.cc new file mode 100644 index 0000000000000000000000000000000000000000..c85b958d7b85cb3e21df8714c89eee10b9b3fecc --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/activation.cc @@ -0,0 +1,55 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/convert/activation.h" +#include +#include + +using anakin::graph::GraphGlobalMem; +using anakin::AK_FLOAT; +using anakin::saber::NV; +using anakin::saber::Shape; + +namespace paddle { +namespace inference { +namespace anakin { + +ActivationOpConverter::ActivationOpConverter(const std::string &op_type) + : op_type_(op_type) { + auto it = anakin_op_types_.find(op_type_); + PADDLE_ENFORCE(it != anakin_op_types_.end(), + "activation op type is not supported"); + anakin_op_type_ = it->second; +} + +void ActivationOpConverter::operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, + bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + + auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + auto input_name = op_desc.Input("X").front(); + auto output_name = op_desc.Output("Out").front(); + engine_->AddOp(op_name, "Activation", {input_name}, {output_name}); + engine_->AddOpAttr(op_name, "type", anakin_op_type_); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(sigmoid, SigmoidOpConverter); +REGISTER_ANAKIN_OP_CONVERTER(tanh, TanhOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/activation.h b/paddle/fluid/inference/anakin/convert/activation.h new file mode 100644 index 0000000000000000000000000000000000000000..49a4518bef418491a7fbc0bcde403bf047f774bd --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/activation.h @@ -0,0 +1,52 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +class ActivationOpConverter : public AnakinOpConverter { + public: + explicit ActivationOpConverter(const std::string &op_type); + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, + bool test_mode) override; + virtual ~ActivationOpConverter() {} + + private: + std::string op_type_; + std::string anakin_op_type_; + std::map anakin_op_types_{{"tanh", "TanH"}, + {"sigmoid", "Sigmoid"}}; +}; + +class TanhOpConverter : public ActivationOpConverter { + public: + TanhOpConverter() : ActivationOpConverter("tanh") {} +}; + +class SigmoidOpConverter : public ActivationOpConverter { + public: + SigmoidOpConverter() : ActivationOpConverter("sigmoid") {} +}; +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/batch_norm.cc b/paddle/fluid/inference/anakin/convert/batch_norm.cc new file mode 100644 index 0000000000000000000000000000000000000000..94014802bdbe1792e9eaba28d7134624dd3edc90 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/batch_norm.cc @@ -0,0 +1,127 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
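
Editor's note: the activation converter above resolves the Paddle op type to the Anakin operator name through a small lookup table and fails fast on anything it does not recognize. A self-contained sketch of that table-lookup pattern follows; the class and names are illustrative, not Paddle's actual API.

```cpp
#include <iostream>
#include <map>
#include <stdexcept>
#include <string>

// Toy analogue of ActivationOpConverter: the constructor validates the op
// type against a fixed table and records the backend's name for it.
class ActivationConverter {
 public:
  explicit ActivationConverter(const std::string& op_type) {
    static const std::map<std::string, std::string> kTypes{
        {"tanh", "TanH"}, {"sigmoid", "Sigmoid"}};
    auto it = kTypes.find(op_type);
    if (it == kTypes.end()) {
      throw std::invalid_argument("activation op type is not supported: " +
                                  op_type);
    }
    anakin_type_ = it->second;
  }
  const std::string& AnakinType() const { return anakin_type_; }

 private:
  std::string anakin_type_;
};

int main() {
  std::cout << ActivationConverter("tanh").AnakinType() << "\n";  // TanH
  return 0;
}
```

One table plus thin `TanhOpConverter`/`SigmoidOpConverter` subclasses keeps each registered op type a one-liner.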
+ +#include "paddle/fluid/inference/anakin/convert/batch_norm.h" +#include +#include +#include +#include +#include + +using anakin::graph::GraphGlobalMem; +using anakin::AK_FLOAT; +using anakin::saber::NV; +using anakin::saber::Shape; + +namespace paddle { +namespace inference { +namespace anakin { + +void BatchNormOpConverter::operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, + bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Output("Y").size(), 1); + std::map inputs; + for (auto k : {"X", "Scale", "Bias", "Mean", "Variance"}) { + PADDLE_ENFORCE_EQ(op_desc.Input(k).size(), 1UL); + auto v = op_desc.Input(k).front(); + inputs.insert({k, v}); + } + + auto output = op_desc.Output("Y").front(); + auto op_name = op_desc.Type() + ":" + op_desc.Output("Y").front(); + auto epsilon = boost::get(op_desc.GetAttr("epsilon")); + // auto momentum = boost::get(op_desc.GetAttr("momentum")); + + auto bn_op_name = op_name + ":bn"; + auto bn_output = bn_op_name + "_output"; + engine_->AddOp(bn_op_name, "BatchNorm", {inputs["X"]}, {bn_output}); + engine_->AddOpAttr(bn_op_name, "epsilon", epsilon); + engine_->AddOpAttr(bn_op_name, "momentum", static_cast(1.0)); + + auto scale_op_name = op_name + ":scale"; + auto get_lod_tensor = [this, &scope, &op_name](const std::string &var_name, + framework::LoDTensor *tensor) { + auto *v = scope.FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL(v); + auto *t = v->GetMutable(); + tensor->Resize(t->dims()); + TensorCopySync(*t, platform::CPUPlace(), tensor); + }; + + framework::LoDTensor bias_t; + framework::LoDTensor mean_t; + framework::LoDTensor scale_t; + framework::LoDTensor variance_t; + get_lod_tensor(inputs["Bias"], &bias_t); + get_lod_tensor(inputs["Mean"], &mean_t); + get_lod_tensor(inputs["Scale"], &scale_t); + get_lod_tensor(inputs["Variance"], &variance_t); + + auto fill_shape = [](size_t n, std::vector shape) { + shape.insert(shape.begin(), 1); + if (shape.size() < n) { + shape.insert(shape.end(), n - shape.size(), 1); + } + return shape; + }; + Shape shape1(fill_shape(4, framework::vectorize2int(mean_t.dims()))); + Shape shape2(fill_shape(4, framework::vectorize2int(variance_t.dims()))); + auto *weight1 = + GraphGlobalMem::Global().template new_block(shape1); + auto *mean_data = static_cast(weight1->h_tensor().mutable_data()); + std::copy_n(mean_t.data(), mean_t.numel(), mean_data); + engine_->AddOpAttr(bn_op_name, "weight_1", *weight1); + + auto *weight2 = + GraphGlobalMem::Global().template new_block(shape2); + auto *variance_data = + static_cast(weight2->h_tensor().mutable_data()); + std::copy_n(variance_t.data(), variance_t.numel(), variance_data); + engine_->AddOpAttr(bn_op_name, "weight_2", *weight2); + + Shape shape3(std::vector({1, 1, 1, 1})); + auto *weight3 = + GraphGlobalMem::Global().template new_block(shape3); + auto *alpha_data = static_cast(weight3->h_tensor().mutable_data()); + float weight3_data[] = {1}; + std::copy(std::begin(weight3_data), std::end(weight3_data), alpha_data); + engine_->AddOpAttr(bn_op_name, "weight_3", *weight3); + + Shape scale_shape(fill_shape(4, framework::vectorize2int(scale_t.dims()))); + auto *scale = + GraphGlobalMem::Global().template new_block(scale_shape); + auto *scale_data = static_cast(scale->h_tensor().mutable_data()); + std::copy_n(scale_t.data(), scale_t.numel(), scale_data); + + Shape bias_shape(fill_shape(4, framework::vectorize2int(bias_t.dims()))); + auto *bias = + GraphGlobalMem::Global().template new_block(bias_shape); + auto *bias_data = 
static_cast(bias->h_tensor().mutable_data()); + std::copy_n(bias_t.data(), bias_t.numel(), bias_data); + + engine_->AddOp(scale_op_name, "Scale", {bn_output}, {output}); + engine_->AddOpAttr(scale_op_name, "axis", 1); + engine_->AddOpAttr(scale_op_name, "num_axes", 1); + engine_->AddOpAttr(scale_op_name, "bias_term", true); + engine_->AddOpAttr(scale_op_name, "weight_1", *scale); + engine_->AddOpAttr(scale_op_name, "weight_2", *bias); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(batch_norm, BatchNormOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/batch_norm.h b/paddle/fluid/inference/anakin/convert/batch_norm.h new file mode 100644 index 0000000000000000000000000000000000000000..cee5c43ae76bf28284118380ca4c861d5cbedd1c --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/batch_norm.h @@ -0,0 +1,35 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +class BatchNormOpConverter : public AnakinOpConverter { + public: + BatchNormOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, + bool test_mode) override; + virtual ~BatchNormOpConverter() {} +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/concat.cc b/paddle/fluid/inference/anakin/convert/concat.cc new file mode 100644 index 0000000000000000000000000000000000000000..e2d1111acbb60690167530a25aeaf59858b71987 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/concat.cc @@ -0,0 +1,51 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
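
Editor's note: the `fill_shape` helper in the batch-norm converter above pads a parameter shape to the 4-D layout Anakin blocks expect: prepend a 1, then append trailing 1s until the shape reaches rank n. A standalone copy of that logic, runnable as-is:

```cpp
#include <iostream>
#include <vector>

// Mirrors the fill_shape lambda in batch_norm.cc: prepend a 1, then pad
// with trailing 1s up to rank n.
std::vector<int> FillShape(size_t n, std::vector<int> shape) {
  shape.insert(shape.begin(), 1);
  if (shape.size() < n) {
    shape.insert(shape.end(), n - shape.size(), 1);
  }
  return shape;
}

int main() {
  // A per-channel mean of 64 elements becomes the 4-D block shape 1 64 1 1.
  for (int d : FillShape(4, {64})) std::cout << d << " ";
  std::cout << "\n";
  return 0;
}
```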
+ +#include "paddle/fluid/inference/anakin/convert/concat.h" +#include + +using anakin::graph::GraphGlobalMem; +using anakin::AK_FLOAT; +using anakin::Precision; +using anakin::saber::NV; +using anakin::saber::X86; +using anakin::saber::Shape; +using anakin::PBlock; +using anakin::PTuple; + +namespace paddle { +namespace inference { +namespace anakin { + +void ConcatOpConverter::operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, + bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + int axis = boost::get(op_desc.GetAttr("axis")); + auto input_names = op_desc.Input("X"); + // PADDLE_ENFORCE(axis > 0, + // "The axis attr of Concat op should be large than 0 for trt"); + + auto y_name = op_desc.Output("Out").front(); + auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + + engine_->AddOp(op_name, "Concat", input_names, {y_name}); + engine_->AddOpAttr(op_name, "axis", axis); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(concat, ConcatOpConverter); diff --git a/paddle/fluid/inference/utils/visualizer.h b/paddle/fluid/inference/anakin/convert/concat.h similarity index 65% rename from paddle/fluid/inference/utils/visualizer.h rename to paddle/fluid/inference/anakin/convert/concat.h index be532f92cf60e06094bfcf8cc2be85085795fcf4..4ff2b6d85b758efc7529c5034a34e094ee06cccb 100644 --- a/paddle/fluid/inference/utils/visualizer.h +++ b/paddle/fluid/inference/anakin/convert/concat.h @@ -14,29 +14,24 @@ #pragma once -#include -#include "paddle/fluid/inference/analysis/argument.h" +#include "paddle/fluid/inference/anakin/convert/op_converter.h" namespace paddle { namespace inference { -namespace utils { +namespace anakin { -using paddle::inference::analysis::Argument; - -class Visualizer final { +class ConcatOpConverter : public AnakinOpConverter { public: - Visualizer() = default; - ~Visualizer() = default; - Visualizer(const Visualizer &) = delete; - Visualizer &operator=(const Visualizer &) = delete; + ConcatOpConverter() = default; - void SetArgument(Argument *); - bool Run(); + virtual void operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, + bool test_mode) override; + virtual ~ConcatOpConverter() {} private: - Argument *argument_; }; -} // namespace utils +} // namespace anakin } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/conv2d.cc b/paddle/fluid/inference/anakin/convert/conv2d.cc new file mode 100644 index 0000000000000000000000000000000000000000..b99c6e71c4dfd2b567d85904f57ebecf0ed9a1cc --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/conv2d.cc @@ -0,0 +1,87 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/inference/anakin/convert/conv2d.h" +#include +#include +#include + +using anakin::graph::GraphGlobalMem; +using anakin::AK_FLOAT; +using anakin::saber::NV; +using anakin::saber::Shape; +using anakin::PTuple; + +namespace paddle { +namespace inference { +namespace anakin { + +void Conv2dOpConverter::operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, + bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1UL); + PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1UL); + PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1UL); + + auto input_name = op_desc.Input("Input").front(); + auto output_name = op_desc.Output("Output").front(); + auto op_name = op_desc.Type() + ":" + op_desc.Output("Output").front(); + engine_->AddOp(op_name, "Convolution", {input_name}, {output_name}); + + auto *filter_v = scope.FindVar(op_desc.Input("Filter").front()); + PADDLE_ENFORCE_NOT_NULL(filter_v); + auto *filter_t = filter_v->GetMutable(); + std::unique_ptr weight_tensor( + new framework::LoDTensor()); + weight_tensor->Resize(filter_t->dims()); + TensorCopySync((*filter_t), platform::CPUPlace(), weight_tensor.get()); + + PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL); + + // const int n_output = weight_tensor->dims()[0]; + // const int n_input = weight_tensor->dims()[1]; + const int filter_h = weight_tensor->dims()[2]; + const int filter_w = weight_tensor->dims()[3]; + // auto filter_num = n_input * filter_h * filter_w ; + auto filter_num = weight_tensor->dims()[0]; + engine_->AddOpAttr(op_name, "filter_num", filter_num); + engine_->AddOpAttr>(op_name, "kernel_size", {filter_h, filter_w}); + auto strides = boost::get>(op_desc.GetAttr("strides")); + engine_->AddOpAttr>(op_name, "strides", strides); + auto paddings = boost::get>(op_desc.GetAttr("paddings")); + engine_->AddOpAttr>(op_name, "padding", paddings); + auto dilations = boost::get>(op_desc.GetAttr("dilations")); + engine_->AddOpAttr>(op_name, "dilation_rate", dilations); + const int groups = boost::get(op_desc.GetAttr("groups")); + engine_->AddOpAttr(op_name, "group", groups); + engine_->AddOpAttr(op_name, "axis", 1); + engine_->AddOpAttr(op_name, "bias_term", false); + + auto weight_shape = framework::vectorize2int(filter_t->dims()); + Shape anakin_shape(weight_shape); + auto *weight1 = + GraphGlobalMem::Global().template new_block(anakin_shape); + float *cpu_data = static_cast(weight1->h_tensor().mutable_data()); + std::copy_n(weight_tensor->data(), weight_tensor->numel(), cpu_data); + weight1->d_tensor().set_shape(anakin_shape); + weight1->d_tensor().copy_from(weight1->h_tensor()); + engine_->AddOpAttr(op_name, "weight_1", *weight1); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(conv2d, Conv2dOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/registrar.cc b/paddle/fluid/inference/anakin/convert/conv2d.h similarity index 66% rename from paddle/fluid/inference/anakin/convert/registrar.cc rename to paddle/fluid/inference/anakin/convert/conv2d.h index 701ebdb2d43cf524330f946ac56d32dfa884f42a..75a30c10d481762fe5579ccb4d79feeba73dc98a 100644 --- a/paddle/fluid/inference/anakin/convert/registrar.cc +++ b/paddle/fluid/inference/anakin/convert/conv2d.h @@ -12,22 +12,23 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/inference/anakin/convert/registrar.h" +#pragma once + +#include "paddle/fluid/inference/anakin/convert/op_converter.h" namespace paddle { namespace inference { namespace anakin { -std::shared_ptr OpRegister::Get(const std::string &name) { - auto it = registry_.find(name); - if (it == registry_.end()) return nullptr; - return it->second(); -} +class Conv2dOpConverter : public AnakinOpConverter { + public: + Conv2dOpConverter() = default; -OpRegister *OpRegister::instance() { - static OpRegister factory; - return &factory; -} + virtual void operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, + bool test_mode) override; + virtual ~Conv2dOpConverter() {} +}; } // namespace anakin } // namespace inference diff --git a/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc b/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc new file mode 100644 index 0000000000000000000000000000000000000000..4d105430dd298076fa8aa4c1925329c3a0e356a1 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc @@ -0,0 +1,113 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/convert/conv2d_fusion.h" +#include +#include +#include + +using anakin::graph::GraphGlobalMem; +using anakin::AK_FLOAT; +using anakin::saber::NV; +using anakin::saber::Shape; +using anakin::PTuple; + +namespace paddle { +namespace inference { +namespace anakin { + +void Conv2dFusionOpConverter::operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, + bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1UL); + PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1UL); + PADDLE_ENFORCE_EQ(op_desc.Input("Bias").size(), 1UL); + PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1UL); + + auto input_name = op_desc.Input("Input").front(); + auto output_name = op_desc.Output("Output").front(); + auto op_name = op_desc.Type() + ":" + op_desc.Output("Output").front(); + engine_->AddOp(op_name, "Convolution", {input_name}, {output_name}); + + auto *filter_v = scope.FindVar(op_desc.Input("Filter").front()); + PADDLE_ENFORCE_NOT_NULL(filter_v); + auto *filter_t = filter_v->GetMutable(); + + auto *b_v = scope.FindVar(op_desc.Input("Bias").front()); + PADDLE_ENFORCE_NOT_NULL(b_v); + auto *b_t = b_v->GetMutable(); + + std::unique_ptr weight_tensor( + new framework::LoDTensor()); + weight_tensor->Resize(filter_t->dims()); + TensorCopySync((*filter_t), platform::CPUPlace(), weight_tensor.get()); + + PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL); + + // const int n_output = weight_tensor->dims()[0]; + // const int n_input = weight_tensor->dims()[1]; + const int filter_h = weight_tensor->dims()[2]; + const int filter_w = weight_tensor->dims()[3]; + // auto filter_num = n_input * filter_h * filter_w ; + auto filter_num = weight_tensor->dims()[0]; + engine_->AddOpAttr(op_name, "filter_num", 
filter_num); + engine_->AddOpAttr>(op_name, "kernel_size", {filter_h, filter_w}); + auto strides = boost::get>(op_desc.GetAttr("strides")); + engine_->AddOpAttr>(op_name, "strides", strides); + auto paddings = boost::get>(op_desc.GetAttr("paddings")); + engine_->AddOpAttr>(op_name, "padding", paddings); + auto dilations = boost::get>(op_desc.GetAttr("dilations")); + engine_->AddOpAttr>(op_name, "dilation_rate", dilations); + const int groups = boost::get(op_desc.GetAttr("groups")); + engine_->AddOpAttr(op_name, "group", groups); + engine_->AddOpAttr(op_name, "axis", 1); + engine_->AddOpAttr(op_name, "bias_term", true); + + auto weight_shape = framework::vectorize2int(filter_t->dims()); + Shape anakin_shape(weight_shape); + auto *weight1 = + GraphGlobalMem::Global().template new_block(anakin_shape); + float *cpu_data = static_cast(weight1->h_tensor().mutable_data()); + std::copy_n(weight_tensor->data(), weight_tensor->numel(), cpu_data); + weight1->d_tensor().set_shape(anakin_shape); + weight1->d_tensor().copy_from(weight1->h_tensor()); + engine_->AddOpAttr(op_name, "weight_1", *weight1); + + auto bias_shape = framework::vectorize2int(b_t->dims()); + framework::LoDTensor bias_tensor; + bias_tensor.Resize(b_t->dims()); + TensorCopySync((*b_t), platform::CPUPlace(), &bias_tensor); + auto *bias_data = bias_tensor.data(); + bias_shape.insert(bias_shape.begin(), 1); + bias_shape.insert(bias_shape.begin(), 1); + bias_shape.insert(bias_shape.begin(), 1); + // bias_shape.push_back(1); + // bias_shape.push_back(1); + Shape anakin_bias_shape(bias_shape); + + auto *weight2 = GraphGlobalMem::Global().template new_block( + anakin_bias_shape); + float *cpu_data2 = static_cast(weight2->h_tensor().mutable_data()); + std::copy_n(bias_data, bias_tensor.numel(), cpu_data2); + weight2->d_tensor().set_shape(anakin_bias_shape); + weight2->d_tensor().copy_from(weight2->h_tensor()); + engine_->AddOpAttr(op_name, "weight_2", *weight2); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(conv2d_fusion, Conv2dFusionOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/conv2d_fusion.h b/paddle/fluid/inference/anakin/convert/conv2d_fusion.h new file mode 100644 index 0000000000000000000000000000000000000000..07359b9cba05bf7c885eb38d64816bdb718a6aba --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/conv2d_fusion.h @@ -0,0 +1,35 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
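
Editor's note: for reference, the `strides`/`paddings`/`dilations` attributes forwarded by the two conv converters obey the standard dilated-convolution size relation out = (in + 2*pad - dilation*(k - 1) - 1) / stride + 1; a quick numeric check:

```cpp
#include <iostream>

// Standard dilated-convolution output size; this is the semantics behind
// the strides/paddings/dilations attributes, not code from this diff.
int ConvOutSize(int in, int k, int stride, int pad, int dilation) {
  return (in + 2 * pad - dilation * (k - 1) - 1) / stride + 1;
}

int main() {
  // 224 input, 3x3 kernel, stride 1, pad 1, dilation 1 -> 224 ("same" size).
  std::cout << ConvOutSize(224, 3, 1, 1, 1) << "\n";
  // Stride 2 halves it: 112.
  std::cout << ConvOutSize(224, 3, 2, 1, 1) << "\n";
  return 0;
}
```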
+ +#pragma once + +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +class Conv2dFusionOpConverter : public AnakinOpConverter { + public: + Conv2dFusionOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, + bool test_mode) override; + virtual ~Conv2dFusionOpConverter() {} +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/density_prior_box.cc b/paddle/fluid/inference/anakin/convert/density_prior_box.cc new file mode 100644 index 0000000000000000000000000000000000000000..a55c153f99a815c0e0092b69b8e181630aed16bf --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/density_prior_box.cc @@ -0,0 +1,90 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/convert/density_prior_box.h" +#include +#include +#include + +using anakin::graph::GraphGlobalMem; +using anakin::AK_FLOAT; +using anakin::saber::NV; +using anakin::saber::Shape; +using anakin::PTuple; + +namespace paddle { +namespace inference { +namespace anakin { + +void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, + bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + auto input_name = op_desc.Input("Input").front(); + auto image_name = op_desc.Input("Image").front(); + auto output_name = op_desc.Output("Boxes").front(); + + auto op_name = op_desc.Type() + ":" + op_desc.Output("Boxes").front(); + + auto fixed_sizes = + boost::get>(op_desc.GetAttr("fixed_sizes")); + auto fixed_ratios = + boost::get>(op_desc.GetAttr("fixed_ratios")); + auto densities = boost::get>(op_desc.GetAttr("densities")); + std::vector dens; + for (auto& ele : densities) { + dens.push_back(static_cast(ele)); + } + + // lack flip + // auto clip = boost::get(op_desc.GetAttr("clip")); + auto variances = boost::get>(op_desc.GetAttr("variances")); + for (auto& ele : variances) { + LOG(INFO) << ele; + } + + // lack img_h, img_w + auto step_h = boost::get(op_desc.GetAttr("step_h")); + auto step_w = boost::get(op_desc.GetAttr("step_w")); + auto offset = boost::get(op_desc.GetAttr("offset")); + PTuple t_order; + t_order.push_back("MIN"); + t_order.push_back("COM"); + t_order.push_back("MAX"); + + std::vector temp_v = {}; + + engine_->AddOp(op_name, "PriorBox", {input_name, image_name}, {output_name}); + engine_->AddOpAttr>(op_name, "min_size", temp_v); + engine_->AddOpAttr>(op_name, "max_size", temp_v); + engine_->AddOpAttr>(op_name, "aspect_ratio", temp_v); + engine_->AddOpAttr>(op_name, "fixed_size", fixed_sizes); + engine_->AddOpAttr>(op_name, "fixed_ratio", fixed_ratios); + engine_->AddOpAttr>(op_name, "density", dens); + engine_->AddOpAttr(op_name, "is_flip", static_cast(false)); + engine_->AddOpAttr(op_name, "is_clip", static_cast(false)); + 
engine_->AddOpAttr>(op_name, "variance", variances); + engine_->AddOpAttr(op_name, "img_h", static_cast(0)); + engine_->AddOpAttr(op_name, "img_w", static_cast(0)); + engine_->AddOpAttr(op_name, "step_h", step_h); + engine_->AddOpAttr(op_name, "step_w", step_w); + engine_->AddOpAttr(op_name, "offset", offset); + engine_->AddOpAttr>(op_name, "order", t_order); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(density_prior_box, DensityPriorBoxOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/density_prior_box.h b/paddle/fluid/inference/anakin/convert/density_prior_box.h new file mode 100644 index 0000000000000000000000000000000000000000..44265cbf2e968e8821bc1a9ae3225c9b7d405235 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/density_prior_box.h @@ -0,0 +1,37 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +class DensityPriorBoxOpConverter : public AnakinOpConverter { + public: + DensityPriorBoxOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, + bool test_mode) override; + virtual ~DensityPriorBoxOpConverter() {} +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/detection_out.cc b/paddle/fluid/inference/anakin/convert/detection_out.cc new file mode 100644 index 0000000000000000000000000000000000000000..67636651017cfb18967cf8dc76d4f4a552fbd021 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/detection_out.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
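// A tiny illustrative sketch of the densities conversion above: Paddle stores
// "densities" as ints, while the "density" attribute attached to the Anakin
// PriorBox op is a float tuple, so each element is widened before being added.
// WidenDensities is an invented name for illustration.
#include <cassert>
#include <vector>

std::vector<float> WidenDensities(const std::vector<int> &densities) {
  std::vector<float> dens;
  dens.reserve(densities.size());
  for (int d : densities) dens.push_back(static_cast<float>(d));
  return dens;
}

int main() {
  auto dens = WidenDensities({1, 2, 4});
  assert(dens.size() == 3 && dens[2] == 4.0f);
  return 0;
}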
+ +#include "paddle/fluid/inference/anakin/convert/detection_out.h" +#include +#include + +using anakin::graph::GraphGlobalMem; +using anakin::AK_FLOAT; +using anakin::saber::NV; +using anakin::saber::Shape; + +namespace paddle { +namespace inference { +namespace anakin { + +void DetectionOutOpConverter::operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, + bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + auto target_name = op_desc.Input("TargetBox").front(); + auto prior_box_name = op_desc.Input("PriorBox").front(); + auto scores_name = op_desc.Input("Scores").front(); + auto output_name = op_desc.Output("Out").front(); + + auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + + auto code_type = boost::get(op_desc.GetAttr("code_type")); + auto background_label = boost::get(op_desc.GetAttr("background_label")); + auto score_threshold = boost::get(op_desc.GetAttr("score_threshold")); + auto nms_top_k = boost::get(op_desc.GetAttr("nms_top_k")); + auto nms_threshold = boost::get(op_desc.GetAttr("nms_threshold")); + auto nms_eta = boost::get(op_desc.GetAttr("nms_eta")); + auto keep_top_k = boost::get(op_desc.GetAttr("keep_top_k")); + std::string anakin_code_type; + if (code_type == "decode_center_size") { + anakin_code_type = "CENTER_SIZE"; + } else if (code_type == "encode_center_size") { + PADDLE_THROW( + "Not support encode_center_size code_type in DetectionOut of anakin"); + } + + engine_->AddOp(op_name, "DetectionOutput", + {target_name, scores_name, prior_box_name}, {output_name}); + engine_->AddOpAttr(op_name, "share_location", true); + engine_->AddOpAttr(op_name, "variance_encode_in_target", false); + engine_->AddOpAttr(op_name, "class_num", static_cast(0)); + engine_->AddOpAttr(op_name, "background_id", background_label); + engine_->AddOpAttr(op_name, "keep_top_k", keep_top_k); + engine_->AddOpAttr(op_name, "code_type", anakin_code_type); + engine_->AddOpAttr(op_name, "conf_thresh", score_threshold); + engine_->AddOpAttr(op_name, "nms_top_k", nms_top_k); + engine_->AddOpAttr(op_name, "nms_thresh", nms_threshold); + engine_->AddOpAttr(op_name, "nms_eta", nms_eta); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(detection_out, DetectionOutOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/detection_out.h b/paddle/fluid/inference/anakin/convert/detection_out.h new file mode 100644 index 0000000000000000000000000000000000000000..5bf1c3ecbc89795d075301a2fd568312236bd874 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/detection_out.h @@ -0,0 +1,37 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
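// A hedged sketch of the code_type translation in DetectionOutOpConverter
// above: Paddle's "decode_center_size" maps onto Anakin's "CENTER_SIZE", and
// "encode_center_size" is rejected outright. MapCodeType is an illustrative
// helper, not a Paddle symbol.
#include <stdexcept>
#include <string>

std::string MapCodeType(const std::string &code_type) {
  if (code_type == "decode_center_size") return "CENTER_SIZE";
  throw std::runtime_error("code_type not supported in DetectionOut: " +
                           code_type);
}

int main() { return MapCodeType("decode_center_size") == "CENTER_SIZE" ? 0 : 1; }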
+ +#pragma once + +#include +#include +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +class DetectionOutOpConverter : public AnakinOpConverter { + public: + DetectionOutOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, + bool test_mode) override; + virtual ~DetectionOutOpConverter() {} +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/dropout.cc b/paddle/fluid/inference/anakin/convert/dropout.cc new file mode 100644 index 0000000000000000000000000000000000000000..ed6d7f7561cb78666855146864b33254026926ef --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/dropout.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/convert/dropout.h" +#include +#include +#include + +using anakin::graph::GraphGlobalMem; +using anakin::AK_FLOAT; +using anakin::Precision; +using anakin::saber::NV; +using anakin::saber::X86; +using anakin::saber::Shape; +using anakin::PBlock; +using anakin::PTuple; + +namespace paddle { +namespace inference { +namespace anakin { + +void DropoutOpConverter::operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, + bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Output("Mask").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + + auto x_name = op_desc.Input("X").front(); + auto out_name = op_desc.Output("Out").front(); + auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + + engine_->AddOp(op_name, "Scale", {x_name}, {out_name}); + + auto dropout_prob = boost::get(op_desc.GetAttr("dropout_prob")); + auto factor = 1 - dropout_prob; + Shape shape1(std::vector({1, 1, 1, 1})); + auto *weight1 = + GraphGlobalMem::Global().template new_block(shape1); + auto *factor_data = static_cast(weight1->h_tensor().mutable_data()); + float weight1_data[] = {factor}; + std::copy(std::begin(weight1_data), std::end(weight1_data), factor_data); + + engine_->AddOpAttr(op_name, "weight_1", *weight1); + engine_->AddOpAttr(op_name, "axis", 0); + engine_->AddOpAttr(op_name, "num_axes", 0); + engine_->AddOpAttr(op_name, "bias_term", false); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(dropout, DropoutOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/dropout.h b/paddle/fluid/inference/anakin/convert/dropout.h new file mode 100644 index 0000000000000000000000000000000000000000..2a0fb6e76ac8354d884f9d815a4df785248e6475 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/dropout.h @@ -0,0 +1,37 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +class DropoutOpConverter : public AnakinOpConverter { + public: + DropoutOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, + bool test_mode) override; + virtual ~DropoutOpConverter() {} + + private: +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/elementwise.cc b/paddle/fluid/inference/anakin/convert/elementwise.cc new file mode 100644 index 0000000000000000000000000000000000000000..55b12390baf90a9365fd4d197b19a3c5cd675afd --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/elementwise.cc @@ -0,0 +1,88 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
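// Sketch of the dropout-to-Scale rewrite above: at inference time dropout
// reduces to multiplying activations by (1 - dropout_prob), which is why the
// converter emits a Scale op whose single weight is that factor. This
// standalone function only mirrors the arithmetic.
#include <cassert>
#include <vector>

std::vector<float> InferenceDropout(const std::vector<float> &x,
                                    float dropout_prob) {
  const float factor = 1.0f - dropout_prob;  // the weight_1 value above
  std::vector<float> y(x.size());
  for (size_t i = 0; i < x.size(); ++i) y[i] = x[i] * factor;
  return y;
}

int main() {
  auto y = InferenceDropout({2.0f, 4.0f}, 0.5f);
  assert(y[0] == 1.0f && y[1] == 2.0f);
  return 0;
}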
+ +#include "paddle/fluid/inference/anakin/convert/elementwise.h" +#include +#include +#include + +using anakin::graph::GraphGlobalMem; +using anakin::AK_FLOAT; +using anakin::Precision; +using anakin::saber::NV; +using anakin::saber::X86; +using anakin::saber::Shape; +using anakin::PBlock; +using anakin::PTuple; + +namespace paddle { +namespace inference { +namespace anakin { + +void ElementwiseAddOpConverter::operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, + bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + + auto x_name = op_desc.Input("X").front(); + auto y_name = op_desc.Input("Y").front(); + auto out_name = op_desc.Output("Out").front(); + auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + + engine_->AddOp(op_name, "Eltwise", {x_name, y_name}, {out_name}); + std::string elementwise_type = "Add"; + engine_->AddOpAttr(op_name, "type", elementwise_type); + std::vector coeff = {1.0, 1.0}; + engine_->AddOpAttr>(op_name, "coeff", coeff); +} + +void ElementwiseMulOpConverter::operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, + bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + + auto x_name = op_desc.Input("X").front(); + auto y_name = op_desc.Input("Y").front(); + auto out_name = op_desc.Output("Out").front(); + auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + + engine_->AddOp(op_name, "Scale", {x_name, y_name}, {out_name}); + // Fill a number to weight_1 as a placeholder. + Shape shape1(std::vector({1, 1, 1, 1})); + auto *weight1 = + GraphGlobalMem::Global().template new_block(shape1); + auto *placeholder_data = + static_cast(weight1->h_tensor().mutable_data()); + float weight1_data[] = {1}; + std::copy(std::begin(weight1_data), std::end(weight1_data), placeholder_data); + engine_->AddOpAttr(op_name, "weight_1", *weight1); + + auto axis = boost::get(op_desc.GetAttr("axis")); + engine_->AddOpAttr(op_name, "axis", axis); + engine_->AddOpAttr(op_name, "num_axes", 1); + engine_->AddOpAttr(op_name, "bias_term", false); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(elementwise_add, ElementwiseAddOpConverter); +REGISTER_ANAKIN_OP_CONVERTER(elementwise_mul, ElementwiseMulOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/elementwise.h b/paddle/fluid/inference/anakin/convert/elementwise.h new file mode 100644 index 0000000000000000000000000000000000000000..47525e41daafcbca0c7c86bad44066f18a3ac79c --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/elementwise.h @@ -0,0 +1,49 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
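// The Eltwise mapping above expresses elementwise_add as a coefficient-
// weighted sum with coeff = {1, 1}; a minimal sketch of that contract, with
// invented names:
#include <cassert>
#include <vector>

std::vector<float> WeightedEltwiseSum(const std::vector<float> &x,
                                      const std::vector<float> &y, float cx,
                                      float cy) {
  assert(x.size() == y.size());
  std::vector<float> out(x.size());
  for (size_t i = 0; i < x.size(); ++i) out[i] = cx * x[i] + cy * y[i];
  return out;
}

int main() {
  auto out = WeightedEltwiseSum({1, 2}, {3, 4}, 1.0f, 1.0f);
  assert(out[0] == 4.0f && out[1] == 6.0f);
  return 0;
}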
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +class ElementwiseAddOpConverter : public AnakinOpConverter { + public: + ElementwiseAddOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, + bool test_mode) override; + virtual ~ElementwiseAddOpConverter() {} + + private: +}; + +class ElementwiseMulOpConverter : public AnakinOpConverter { + public: + ElementwiseMulOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, + bool test_mode) override; + virtual ~ElementwiseMulOpConverter() {} + + private: +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/fc.cc b/paddle/fluid/inference/anakin/convert/fc.cc index 33a5aff1de2851ad55c2df83cc48ba86f8ded754..2514eb1e093b4e05b7e6b2814cfd8185b3aede6c 100644 --- a/paddle/fluid/inference/anakin/convert/fc.cc +++ b/paddle/fluid/inference/anakin/convert/fc.cc @@ -14,60 +14,108 @@ #include "paddle/fluid/inference/anakin/convert/fc.h" #include +#include +#include using anakin::graph::GraphGlobalMem; using anakin::AK_FLOAT; -using anakin::Precision; using anakin::saber::NV; -using anakin::saber::X86; using anakin::saber::Shape; -using anakin::PBlock; -using anakin::PTuple; namespace paddle { namespace inference { namespace anakin { -void FcOpConverter::operator()(const framework::proto::OpDesc &op, - const framework::Scope &scope, bool test_mode) { +void FcBaseOpConverter::operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, + bool test_mode) { framework::OpDesc op_desc(op, nullptr); - PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); - PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); - PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + auto input_names = op_desc.InputNames(); + bool with_bias = input_names.size() == 3; + + std::string w_name = "Y"; + std::string i_name = "X"; + if (with_bias) { + w_name = "W"; + i_name = "Input"; + } - auto x_name = op_desc.Input("X").front(); auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); - auto *y_v = scope.FindVar(op_desc.Input("Y").front()); + + // get weights + auto *y_v = scope.FindVar(op_desc.Input(w_name).front()); PADDLE_ENFORCE_NOT_NULL(y_v); auto *y_t = y_v->GetMutable(); - auto input_name = op_desc.Input("X").front(); + auto input_name = op_desc.Input(i_name).front(); auto output_name = op_desc.Output("Out").front(); - auto weight_shape = framework::vectorize2int(y_t->dims()); engine_->AddOp(op_name, "Dense", {input_name}, {output_name}); - engine_->AddOpAttr(op_name, "bias_term", false); + engine_->AddOpAttr(op_name, "bias_term", with_bias); engine_->AddOpAttr(op_name, "axis", 1); + + auto weight_shape = framework::vectorize2int(y_t->dims()); int out_dim = weight_shape[1]; engine_->AddOpAttr(op_name, "out_dim", out_dim); + const int w_m = weight_shape[0]; + const int w_k = weight_shape[1]; - weight_shape.push_back(1); - weight_shape.push_back(1); + if (weight_shape.size() < 4UL) { + weight_shape.insert(weight_shape.begin(), 4UL - weight_shape.size(), 1); + } Shape anakin_shape(weight_shape); framework::LoDTensor weight_tensor; weight_tensor.Resize(y_t->dims()); TensorCopySync((*y_t), platform::CPUPlace(), &weight_tensor); + auto *weight_data = 
weight_tensor.data(); + PADDLE_ENFORCE(w_m * w_k == weight_tensor.numel()); + std::vector trans_weight_data(weight_tensor.numel()); + for (int i = 0; i < w_m; i++) { + for (int j = 0; j < w_k; j++) { + trans_weight_data[i + j * w_m] = weight_data[i * w_k + j]; + } + } auto *weight1 = GraphGlobalMem::Global().template new_block(anakin_shape); float *cpu_data = static_cast(weight1->h_tensor().mutable_data()); - std::copy_n(weight_tensor.data(), weight_tensor.numel(), cpu_data); + std::copy_n(trans_weight_data.data(), weight_tensor.numel(), cpu_data); weight1->d_tensor().set_shape(anakin_shape); weight1->d_tensor().copy_from(weight1->h_tensor()); engine_->AddOpAttr(op_name, "weight_1", *weight1); + + // get bias + if (with_bias) { + auto *b_v = scope.FindVar(op_desc.Input("Bias").front()); + PADDLE_ENFORCE_NOT_NULL(b_v); + auto *b_t = b_v->GetMutable(); + + auto bias_shape = framework::vectorize2int(b_t->dims()); + framework::LoDTensor bias_tensor; + bias_tensor.Resize(b_t->dims()); + TensorCopySync((*b_t), platform::CPUPlace(), &bias_tensor); + auto *bias_data = bias_tensor.data(); + bias_shape.insert(bias_shape.begin(), 1); + bias_shape.insert(bias_shape.begin(), 1); + bias_shape.insert(bias_shape.begin(), 1); + // bias_shape.push_back(1); + // bias_shape.push_back(1); + Shape anakin_bias_shape(bias_shape); + + auto *weight2 = GraphGlobalMem::Global().template new_block( + anakin_bias_shape); + float *cpu_data2 = static_cast(weight2->h_tensor().mutable_data()); + std::copy_n(bias_data, bias_tensor.numel(), cpu_data2); + weight2->d_tensor().set_shape(anakin_bias_shape); + weight2->d_tensor().copy_from(weight2->h_tensor()); + engine_->AddOpAttr(op_name, "weight_2", *weight2); + } } } // namespace anakin } // namespace inference } // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(mul, MulOpConverter); +REGISTER_ANAKIN_OP_CONVERTER(fc, FcOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/fc.h b/paddle/fluid/inference/anakin/convert/fc.h index b670486f12b36043a01ceb002da8756901ed01ce..060c649b19ef335a9e926eb205ec691a2a188fe1 100644 --- a/paddle/fluid/inference/anakin/convert/fc.h +++ b/paddle/fluid/inference/anakin/convert/fc.h @@ -20,19 +20,28 @@ namespace paddle { namespace inference { namespace anakin { -class FcOpConverter : public AnakinOpConverter { +class FcBaseOpConverter : public AnakinOpConverter { public: - FcOpConverter() = default; + FcBaseOpConverter() = default; virtual void operator()(const framework::proto::OpDesc &op, const framework::Scope &scope, bool test_mode) override; - virtual ~FcOpConverter() {} + virtual ~FcBaseOpConverter() {} +}; - private: +// with bias +class FcOpConverter : public FcBaseOpConverter { + public: + FcOpConverter() = default; +}; + +// without bias +class MulOpConverter : public FcBaseOpConverter { + public: + MulOpConverter() = default; }; -static Registrar register_fc_op_converter("fc"); } // namespace anakin } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/flatten.cc b/paddle/fluid/inference/anakin/convert/flatten.cc new file mode 100644 index 0000000000000000000000000000000000000000..c6c372bbef87de7f38c1f66a21c170cabac8c0ed --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/flatten.cc @@ -0,0 +1,51 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
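// Standalone sketch of the weight transpose performed in FcBaseOpConverter
// above: Paddle stores the fc/mul weight row-major as [w_m, w_k], and the
// converter hands Anakin the transposed layout, moving element (i, j) from
// index i * k + j to index i + j * m.
#include <cassert>
#include <vector>

std::vector<float> TransposeWeights(const std::vector<float> &w, int m, int k) {
  assert(static_cast<int>(w.size()) == m * k);
  std::vector<float> trans(w.size());
  for (int i = 0; i < m; i++)
    for (int j = 0; j < k; j++) trans[i + j * m] = w[i * k + j];
  return trans;
}

int main() {
  // 2x3 row-major {1,2,3,4,5,6} becomes {1,4,2,5,3,6}.
  auto t = TransposeWeights({1, 2, 3, 4, 5, 6}, 2, 3);
  assert(t[0] == 1 && t[1] == 4 && t[2] == 2 && t[5] == 6);
  return 0;
}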
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/anakin/convert/flatten.h"
+#include <vector>
+
+using anakin::graph::GraphGlobalMem;
+using anakin::AK_FLOAT;
+using anakin::saber::NV;
+using anakin::saber::Shape;
+using anakin::PTuple;
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+void FlattenOpConverter::operator()(const framework::proto::OpDesc &op,
+                                    const framework::Scope &scope,
+                                    bool test_mode) {
+  framework::OpDesc op_desc(op, nullptr);
+  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1UL);
+  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1UL);
+
+  auto input = op_desc.Input("X").front();
+  auto output = op_desc.Output("Out").front();
+  int axis = boost::get<int>(op_desc.GetAttr("axis"));
+  PADDLE_ENFORCE(axis == 1,
+                 "the anakin flatten op converter now only supports axis == 1.");
+
+  std::vector<int> out_dims = {0, -1, 1, 1};
+  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
+  engine_->AddOp(op_name, "Reshape", {input}, {output});
+  engine_->AddOpAttr<PTuple<int>>(op_name, "dims", out_dims);
+}
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_ANAKIN_OP_CONVERTER(flatten, FlattenOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/flatten.h b/paddle/fluid/inference/anakin/convert/flatten.h
new file mode 100644
index 0000000000000000000000000000000000000000..1ace76b16381980a9eaec12806e0bc94d7b1fb85
--- /dev/null
+++ b/paddle/fluid/inference/anakin/convert/flatten.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+class FlattenOpConverter : public AnakinOpConverter {
+ public:
+  FlattenOpConverter() = default;
+
+  virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::Scope &scope,
+                          bool test_mode) override;
+  virtual ~FlattenOpConverter() {}
+};
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/im2sequence.cc b/paddle/fluid/inference/anakin/convert/im2sequence.cc
new file mode 100644
index 0000000000000000000000000000000000000000..568d7e4746f11b13ce8ea9e5a47a1b43d1c12693
--- /dev/null
+++ b/paddle/fluid/inference/anakin/convert/im2sequence.cc
@@ -0,0 +1,62 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
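// Sketch of the flatten-to-Reshape mapping above, assuming the common reshape
// convention that 0 means "keep this input dimension" and -1 means "infer it":
// with axis == 1, {0, -1, 1, 1} keeps the batch dimension and folds everything
// else into one axis. ResolveDims is illustrative only.
#include <cassert>
#include <vector>

std::vector<int> ResolveDims(const std::vector<int> &in,
                             const std::vector<int> &spec) {
  long total = 1;
  for (int d : in) total *= d;
  std::vector<int> out(spec.size());
  long known = 1;
  int infer = -1;
  for (size_t i = 0; i < spec.size(); ++i) {
    if (spec[i] == 0) {
      out[i] = in[i];
    } else if (spec[i] == -1) {
      infer = static_cast<int>(i);
      out[i] = 1;
    } else {
      out[i] = spec[i];
    }
    if (spec[i] != -1) known *= out[i];
  }
  if (infer >= 0) out[infer] = static_cast<int>(total / known);
  return out;
}

int main() {
  auto out = ResolveDims({8, 3, 4, 4}, {0, -1, 1, 1});
  assert(out[0] == 8 && out[1] == 48 && out[2] == 1 && out[3] == 1);
  return 0;
}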
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/convert/im2sequence.h" +#include +#include +#include + +using anakin::graph::GraphGlobalMem; +using anakin::AK_FLOAT; +using anakin::Precision; +using anakin::saber::NV; +using anakin::saber::X86; +using anakin::saber::Shape; +using anakin::PBlock; +using anakin::PTuple; + +namespace paddle { +namespace inference { +namespace anakin { + +void Im2SequenceConverter::operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, + bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Output("Y").size(), 0); + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + + auto x_name = op_desc.Input("X").front(); + auto out_name = op_desc.Output("Out").front(); + auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + + engine_->AddOp(op_name, "Im2Sequence", {x_name}, {out_name}); + + std::vector dilations = {1, 1}; + auto paddings = boost::get>(op_desc.GetAttr("paddings")); + auto strides = boost::get>(op_desc.GetAttr("strides")); + auto kernels = boost::get>(op_desc.GetAttr("kernels")); + + engine_->AddOpAttr>(op_name, "paddings", paddings); + engine_->AddOpAttr>(op_name, "strides", strides); + engine_->AddOpAttr>(op_name, "window_size", kernels); + engine_->AddOpAttr>(op_name, "dilations", dilations); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(im2sequence, Im2SequenceConverter); diff --git a/paddle/fluid/inference/anakin/convert/im2sequence.h b/paddle/fluid/inference/anakin/convert/im2sequence.h new file mode 100644 index 0000000000000000000000000000000000000000..3003eac2c6f416663c3e7c4c3e297b6347edfb47 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/im2sequence.h @@ -0,0 +1,37 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+class Im2SequenceConverter : public AnakinOpConverter {
+ public:
+  Im2SequenceConverter() = default;
+
+  virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::Scope &scope,
+                          bool test_mode) override;
+  virtual ~Im2SequenceConverter() {}
+
+ private:
+};
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/op_converter.h b/paddle/fluid/inference/anakin/convert/op_converter.h
index b9a221079dcec78fc86ebed7dfac0c59ec0f8540..4603681e1e8a3c2841a62cc88b49a84950910e73 100644
--- a/paddle/fluid/inference/anakin/convert/op_converter.h
+++ b/paddle/fluid/inference/anakin/convert/op_converter.h
@@ -14,15 +14,16 @@
 
 #pragma once
 
+#include <map>
 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <vector>
+#include <unordered_set>
 #include "framework/core/types.h"
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/inference/anakin/convert/registrar.h"
 #include "paddle/fluid/inference/anakin/engine.h"
 #include "paddle/fluid/inference/utils/singleton.h"
 #include "saber/saber_types.h"
@@ -46,19 +47,14 @@ class AnakinOpConverter {
                  bool test_mode = false) {
     framework::OpDesc op_desc(op, nullptr);
     std::string op_type = op_desc.Type();
-    std::shared_ptr<AnakinOpConverter> it{nullptr};
-
-    if (op_type == "mul") {
-      PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL);
-      std::string Y = op_desc.Input("Y")[0];
-      std::cout << Y << parameters.count(Y) << std::endl;
-      if (parameters.count(Y)) {
-        it = OpRegister::instance()->Get("fc");
-      }
-    }
+    AnakinOpConverter *it = nullptr;
+
+    if (op_type == "reshape2") op_type = "reshape";
+    if (op_type == "transpose2") op_type = "transpose";
+    if (op_type == "flatten2") op_type = "flatten";
 
     if (!it) {
-      it = OpRegister::instance()->Get(op_type);
+      it = Registry<AnakinOpConverter>::Global().Lookup(op_type);
     }
     PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", op_type);
     it->SetEngine(engine);
@@ -74,6 +70,63 @@ class AnakinOpConverter {
       ConvertOp(op, parameters, scope, engine);
     }
   }
+
+  // The scope here should be initialized with the parameter vars.
+  void ConvertBlockToAnakinEngine(
+      framework::BlockDesc *block_desc, framework::Scope *scope,
+      const std::vector<std::string> &inputs,
+      const std::unordered_set<std::string> &parameters,
+      const std::vector<std::string> &outputs, AnakinNvEngine *engine) {
+    framework::proto::BlockDesc *block_proto = block_desc->Proto();
+    ConvertBlock(*block_proto, parameters, *scope, engine);
+
+    engine->Freeze();
+    // Check the max batch size configured on the engine.
+    int max_batch_size = engine->GetMaxBatchSize();
+    PADDLE_ENFORCE(max_batch_size > 0,
+                   "the max_batch_size set from config->EnableAnakinEngine "
+                   "must be larger than 0");
+    // If the user does not specify this variable, we use the input shape from
+    // the block_desc.
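// A dependency-free sketch of the input-shape resolution that
// ConvertBlockToAnakinEngine performs next: dimension 0 always comes from
// max_batch_size; dimensions 1..3 come from the user-provided max_input_shape
// entry when present, otherwise from the variable's shape in the block. Names
// here are invented for illustration.
#include <cassert>
#include <map>
#include <string>
#include <vector>

std::vector<int> ResolveInputShape(
    const std::string &name, int max_batch_size,
    const std::map<std::string, std::vector<int>> &max_input_shape,
    const std::vector<int> &block_var_shape) {
  std::vector<int> shape(4, 1);
  shape[0] = max_batch_size;
  const auto it = max_input_shape.find(name);
  const std::vector<int> &src =
      (it != max_input_shape.end()) ? it->second : block_var_shape;
  assert(src.size() == 4);  // both sources are required to be rank-4
  for (int i = 1; i < 4; i++) shape[i] = src[i];
  return shape;
}

int main() {
  std::map<std::string, std::vector<int>> cfg = {{"x", {1, 3, 224, 224}}};
  auto s = ResolveInputShape("x", 8, cfg, {1, 3, 224, 224});
  assert(s[0] == 8 && s[1] == 3 && s[2] == 224 && s[3] == 224);
  return 0;
}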
+    auto max_input_shape = engine->GetMaxInputShape();
+    std::map<std::string, std::vector<int>> temp_max_input_shape;
+
+    for (auto &input : inputs) {
+      if (parameters.count(input)) continue;
+      std::vector<int> input_shape;
+      input_shape.resize(4);
+      input_shape[0] = max_batch_size;
+      if (max_input_shape.count(input)) {
+        PADDLE_ENFORCE(max_input_shape[input].size() == 4,
+                       "the dimensions of max_input_shape set from "
+                       "config->EnableAnakinEngine must be 4");
+        for (int i = 1; i < 4; i++) {
+          input_shape[i] = max_input_shape[input][i];
+        }
+      } else {
+        auto *var = block_desc->FindVar(input);
+        PADDLE_ENFORCE(var, "no variable called %s", input);
+
+        auto var_shape = var->GetShape();
+        PADDLE_ENFORCE(var_shape.size() == 4);
+
+        for (size_t i = 1; i < var_shape.size(); i++) {
+          input_shape[i] = var_shape[i];
+        }
+      }
+      temp_max_input_shape[input] = input_shape;
+      engine->SetInputShape(input, input_shape);
+      engine->Graph()->RegistVar(input);  // For sharing data with Fluid.
+    }
+    engine->SetMaxInputShape(temp_max_input_shape);
+    engine->Optimize();
+
+    // For anakin to share memory with fluid tensors.
+    engine->AllocTmpMem();
+    engine->InitGraph();
+  }
+
   void SetEngine(AnakinNvEngine *engine) { engine_ = engine; }
   virtual ~AnakinOpConverter() {}
@@ -91,22 +144,23 @@ class AnakinOpConverter {
 }  // namespace inference
 }  // namespace paddle
 
-#define REGISTER_ANAKIN_OP_CONVERTER(op_type__, Converter__)          \
-  struct anakin_##op_type__##_converter                               \
-      : public ::paddle::framework::Registrar {                       \
-    anakin_##op_type__##_converter() {                                \
-      ::paddle::inference::                                           \
-          Registry<AnakinOpConverter>::Register<                      \
-              ::paddle::inference::anakin::Converter__>(#op_type__);  \
-    }                                                                 \
-  };                                                                  \
-  anakin_##op_type__##_converter anakin_##op_type__##_converter__;    \
-  int TouchConverterRegister_anakin_##op_type__() {                   \
-    anakin_##op_type__##_converter__.Touch();                         \
-    return 0;                                                         \
+#define REGISTER_ANAKIN_OP_CONVERTER(op_type__, Converter__)              \
+  struct anakin_##op_type__##_converter                                   \
+      : public ::paddle::framework::Registrar {                           \
+    anakin_##op_type__##_converter() {                                    \
+      LOG(INFO) << "register convert " << #op_type__;                     \
+      ::paddle::inference::Registry<                                      \
+          ::paddle::inference::anakin::AnakinOpConverter>::Global()       \
+          .Register<::paddle::inference::anakin::Converter__>(#op_type__); \
+    }                                                                      \
+  };                                                                       \
+  anakin_##op_type__##_converter anakin_##op_type__##_converter__;         \
+  int TouchConverterRegister_anakin_##op_type__() {                        \
+    anakin_##op_type__##_converter__.Touch();                              \
+    return 0;                                                              \
   }
 
-#define USE_ANAKIN_CONVERTER(op_type__)                                    \
-  extern int TouchConverterRegister_anakin_##op_type__();                  \
-  static int use_op_converter_anakin_##op_type__ __attribute__((unused)) = \
+#define USE_ANAKIN_CONVERTER(op_type__)                                  \
+  extern int TouchConverterRegister_anakin_##op_type__();                \
+  int use_op_converter_anakin_##op_type__ __attribute__((unused)) =      \
       TouchConverterRegister_anakin_##op_type__();
diff --git a/paddle/fluid/inference/anakin/convert/pool2d.cc b/paddle/fluid/inference/anakin/convert/pool2d.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9b01d56a126b2ebc194f5b5bb5b2f52c298a316e
--- /dev/null
+++ b/paddle/fluid/inference/anakin/convert/pool2d.cc
@@ -0,0 +1,80 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/anakin/convert/pool2d.h"
+#include <algorithm>
+#include <string>
+#include <vector>
+
+using anakin::graph::GraphGlobalMem;
+using anakin::AK_FLOAT;
+using anakin::Precision;
+using anakin::saber::NV;
+using anakin::saber::X86;
+using anakin::saber::Shape;
+using anakin::PBlock;
+using anakin::PTuple;
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+void Pool2dOpConverter::operator()(const framework::proto::OpDesc &op,
+                                   const framework::Scope &scope,
+                                   bool test_mode) {
+  framework::OpDesc op_desc(op, nullptr);
+  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
+  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+
+  auto x_name = op_desc.Input("X").front();
+  auto y_name = op_desc.Output("Out").front();
+  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
+
+  bool global_pooling = boost::get<bool>(op_desc.GetAttr("global_pooling"));
+  std::string pool_type =
+      boost::get<std::string>(op_desc.GetAttr("pooling_type"));
+  std::vector<int> ksize =
+      boost::get<std::vector<int>>(op_desc.GetAttr("ksize"));
+  std::vector<int> strides =
+      boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
+  std::vector<int> paddings =
+      boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
+  bool ceil_mode = boost::get<bool>(op_desc.GetAttr("ceil_mode"));
+  std::string anakin_pool_type;
+  if (pool_type == "max") {
+    anakin_pool_type = "MAX";
+  } else if (pool_type == "avg") {
+    if (paddings[0] || paddings[1]) {
+      anakin_pool_type = "AVGEXC";
+    } else {
+      anakin_pool_type = "AVG";
+    }
+  } else {
+    PADDLE_THROW("Anakin unsupported pooling type!");
+  }
+
+  engine_->AddOp(op_name, "Pooling", {x_name}, {y_name});
+  engine_->AddOpAttr<PTuple<int>>(op_name, "pool_size", ksize);
+  engine_->AddOpAttr<PTuple<int>>(op_name, "strides", strides);
+  engine_->AddOpAttr<PTuple<int>>(op_name, "padding", paddings);
+  engine_->AddOpAttr(op_name, "method", anakin_pool_type);
+  engine_->AddOpAttr(op_name, "global_pooling", global_pooling);
+  engine_->AddOpAttr(op_name, "cmp_out_shape_floor_as_conv", !ceil_mode);
+}
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_ANAKIN_OP_CONVERTER(pool2d, Pool2dOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/pool2d.h b/paddle/fluid/inference/anakin/convert/pool2d.h
new file mode 100644
index 0000000000000000000000000000000000000000..1931a03c7ac236b4e57236cd1eb2947110f279a8
--- /dev/null
+++ b/paddle/fluid/inference/anakin/convert/pool2d.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
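// Sketch of the pooling-type translation above: "max" becomes MAX, and "avg"
// becomes AVGEXC (presumably exclusive average, ignoring padded cells in the
// divisor) whenever any padding is present, otherwise AVG. MapPoolType is an
// illustrative helper, not a Paddle symbol.
#include <cassert>
#include <stdexcept>
#include <string>
#include <vector>

std::string MapPoolType(const std::string &pool_type,
                        const std::vector<int> &paddings) {
  if (pool_type == "max") return "MAX";
  if (pool_type == "avg") return (paddings[0] || paddings[1]) ? "AVGEXC" : "AVG";
  throw std::runtime_error("unsupported pooling type: " + pool_type);
}

int main() {
  assert(MapPoolType("max", {0, 0}) == "MAX");
  assert(MapPoolType("avg", {1, 0}) == "AVGEXC");
  assert(MapPoolType("avg", {0, 0}) == "AVG");
  return 0;
}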
+ +#pragma once + +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +class Pool2dOpConverter : public AnakinOpConverter { + public: + Pool2dOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, + bool test_mode) override; + virtual ~Pool2dOpConverter() {} + + private: +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/registrar.h b/paddle/fluid/inference/anakin/convert/registrar.h deleted file mode 100644 index afce66ca084143ae203af9a60089aa2f5d18a725..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/anakin/convert/registrar.h +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include - -namespace paddle { -namespace inference { -namespace anakin { - -class AnakinOpConverter; - -class OpRegister { - public: - OpRegister() = default; - std::shared_ptr Get(const std::string &name); - static OpRegister *instance(); - void OpRegisterFn(const std::string &name, - std::function()> fn) { - registry_[name] = fn; - } - - private: - using RegisterFnType = std::function()>; - std::map()>> - registry_; -}; - -template -class Registrar { - public: - Registrar(const std::string &name, Args... args) { - std::shared_ptr converter = - std::make_shared(std::move(args)...); - OpRegister::instance()->OpRegisterFn(name, - [converter]() { return converter; }); - } -}; - -} // namespace anakin -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/relu.cc b/paddle/fluid/inference/anakin/convert/relu.cc new file mode 100644 index 0000000000000000000000000000000000000000..2ce96db1804a3d6d6d1afac79e4e1fc55ed4c35d --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/relu.cc @@ -0,0 +1,47 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/inference/anakin/convert/relu.h" +#include +#include + +using anakin::graph::GraphGlobalMem; +using anakin::AK_FLOAT; +using anakin::saber::NV; +using anakin::saber::Shape; + +namespace paddle { +namespace inference { +namespace anakin { + +void ReluOpConverter::operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, + bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + + auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + auto input_name = op_desc.Input("X").front(); + auto output_name = op_desc.Output("Out").front(); + + engine_->AddOp(op_name, "ReLU", {input_name}, {output_name}); + engine_->AddOpAttr(op_name, "alpha", 0); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(relu, ReluOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/relu.h b/paddle/fluid/inference/anakin/convert/relu.h new file mode 100644 index 0000000000000000000000000000000000000000..54c4c2316eb32ef70696a2477211008e04892552 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/relu.h @@ -0,0 +1,37 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +class ReluOpConverter : public AnakinOpConverter { + public: + ReluOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, + bool test_mode) override; + virtual ~ReluOpConverter() {} +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/reshape.cc b/paddle/fluid/inference/anakin/convert/reshape.cc new file mode 100644 index 0000000000000000000000000000000000000000..eee36d2f37ea79c841ac8bf60c6e533069d06240 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/reshape.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
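// The ReLU converter above fixes "alpha" at 0; reading that attribute as the
// negative-slope of a leaky ReLU (an assumption, not stated in the patch),
// alpha == 0 degenerates to the plain max(x, 0):
#include <cassert>

float LeakyRelu(float x, float alpha) { return x >= 0.0f ? x : alpha * x; }

int main() {
  assert(LeakyRelu(-2.0f, 0.0f) == 0.0f);  // alpha == 0 gives standard ReLU
  assert(LeakyRelu(3.0f, 0.5f) == 3.0f);
  return 0;
}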
+ +#include "paddle/fluid/inference/anakin/convert/reshape.h" +#include + +using anakin::graph::GraphGlobalMem; +using anakin::AK_FLOAT; +using anakin::saber::NV; +using anakin::saber::Shape; +using anakin::PTuple; + +namespace paddle { +namespace inference { +namespace anakin { + +void ReshapeOpConverter::operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, + bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1UL); + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1UL); + + auto input = op_desc.Input("X").front(); + auto output = op_desc.Output("Out").front(); + + auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + engine_->AddOp(op_name, "Reshape", {input}, {output}); + + auto shape = boost::get>(op_desc.GetAttr("shape")); + if (shape.size() < 4) { + shape.insert(shape.end(), 4 - shape.size(), 1); + } + engine_->AddOpAttr>(op_name, "dims", shape); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(reshape, ReshapeOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/reshape.h b/paddle/fluid/inference/anakin/convert/reshape.h new file mode 100644 index 0000000000000000000000000000000000000000..970e8ce5572572bd18c34eeffa902fa2495c1cce --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/reshape.h @@ -0,0 +1,35 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +class ReshapeOpConverter : public AnakinOpConverter { + public: + ReshapeOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, + bool test_mode) override; + virtual ~ReshapeOpConverter() {} +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/scale.cc b/paddle/fluid/inference/anakin/convert/scale.cc new file mode 100644 index 0000000000000000000000000000000000000000..6f3aa8c5d1111dc2829e241c9331eeb521003c03 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/scale.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
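// The Reshape converter above pads any target shape shorter than rank 4 with
// trailing 1s, since the Anakin layout used here is fixed at four dimensions.
// A one-function sketch of that normalization:
#include <cassert>
#include <vector>

std::vector<int> PadShapeTo4D(std::vector<int> shape) {
  if (shape.size() < 4) shape.insert(shape.end(), 4 - shape.size(), 1);
  return shape;
}

int main() {
  auto s = PadShapeTo4D({32, 64});
  assert(s.size() == 4 && s[0] == 32 && s[1] == 64 && s[2] == 1 && s[3] == 1);
  return 0;
}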
+ +#include "paddle/fluid/inference/anakin/convert/scale.h" +#include +#include + +using anakin::graph::GraphGlobalMem; +using anakin::AK_FLOAT; +using anakin::saber::NV; +using anakin::saber::Shape; + +namespace paddle { +namespace inference { +namespace anakin { + +void ScaleOpConverter::operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, + bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + + auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + + auto input_name = op_desc.Input("X").front(); + auto output_name = op_desc.Output("Out").front(); + float scale = boost::get(op_desc.GetAttr("scale")); + float bias = boost::get(op_desc.GetAttr("bias")); + float bias_after_scale = + boost::get(op_desc.GetAttr("bias_after_scale")); + PADDLE_ENFORCE(bias_after_scale, + "The anakin scale layer only support bias after scale now."); + + engine_->AddOp(op_name, "Power", {input_name}, {output_name}); + engine_->AddOpAttr(op_name, "shift", bias); + engine_->AddOpAttr(op_name, "scale", scale); + engine_->AddOpAttr(op_name, "power", static_cast(1.0)); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(scale, ScaleOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/scale.h b/paddle/fluid/inference/anakin/convert/scale.h new file mode 100644 index 0000000000000000000000000000000000000000..b858e3c512494f80c7c3818a570e43d90d65251b --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/scale.h @@ -0,0 +1,37 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +class ScaleOpConverter : public AnakinOpConverter { + public: + ScaleOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, + bool test_mode) override; + virtual ~ScaleOpConverter() {} +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/softmax.cc b/paddle/fluid/inference/anakin/convert/softmax.cc new file mode 100644 index 0000000000000000000000000000000000000000..d5cd8908ebf623f0334a3b4df2b19147c63f77a3 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/softmax.cc @@ -0,0 +1,43 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
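// The Scale converter above lowers Paddle's scale op onto Anakin's "Power"
// layer; judging by the attribute names, Power computes
// (shift + scale * x) ^ power, so with power fixed to 1 this is exactly
// bias-after-scale. A scalar sketch of that identity (assumed semantics):
#include <cassert>
#include <cmath>

float PowerLayer(float x, float shift, float scale, float power) {
  return std::pow(shift + scale * x, power);
}

int main() {
  // scale = 2, bias (shift) = 3, power = 1  =>  y = 2 * x + 3
  assert(std::fabs(PowerLayer(5.0f, 3.0f, 2.0f, 1.0f) - 13.0f) < 1e-6f);
  return 0;
}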
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/convert/softmax.h" + +using anakin::graph::GraphGlobalMem; +using anakin::AK_FLOAT; +using anakin::saber::NV; +using anakin::saber::Shape; + +namespace paddle { +namespace inference { +namespace anakin { + +void SoftMaxOpConverter::operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, + bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1UL); + + auto input = op_desc.Input("X").front(); + auto output = op_desc.Output("Out").front(); + auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + engine_->AddOp(op_name, "Softmax", {input}, {output}); + engine_->AddOpAttr(op_name, "axis", 2); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(softmax, SoftMaxOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/softmax.h b/paddle/fluid/inference/anakin/convert/softmax.h new file mode 100644 index 0000000000000000000000000000000000000000..0508da0c6fecaf29b7376005904235dadf04ea28 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/softmax.h @@ -0,0 +1,35 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +class SoftMaxOpConverter : public AnakinOpConverter { + public: + SoftMaxOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, + bool test_mode) override; + virtual ~SoftMaxOpConverter() {} +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/split.cc b/paddle/fluid/inference/anakin/convert/split.cc new file mode 100644 index 0000000000000000000000000000000000000000..b8464a766d21e93426eb4a00b8caab2af5470055 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/split.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/anakin/convert/split.h"
+#include <algorithm>
+#include <map>
+
+using anakin::graph::GraphGlobalMem;
+using anakin::AK_FLOAT;
+using anakin::Precision;
+using anakin::saber::NV;
+using anakin::saber::X86;
+using anakin::saber::Shape;
+using anakin::PBlock;
+using anakin::PTuple;
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+void SplitOpConverter::operator()(const framework::proto::OpDesc &op,
+                                  const framework::Scope &scope,
+                                  bool test_mode) {
+  framework::OpDesc op_desc(op, nullptr);
+  auto input_name = op_desc.Input("X").front();
+  auto y_names = op_desc.Output("Out");
+  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
+  int axis = boost::get<int>(op_desc.GetAttr("axis"));
+
+  std::vector<int> output_lengths =
+      boost::get<std::vector<int>>(op_desc.GetAttr("sections"));
+
+  int split_num = output_lengths.size();
+  PADDLE_ENFORCE(split_num > 1,
+                 "anakin split op converter: the split num should > 1");
+  int num_sum = 0;
+  std::vector<int> slice_point;
+  for (int i = 0; i < split_num - 1; i++) {
+    num_sum += output_lengths[i];
+    slice_point.push_back(num_sum);
+  }
+  engine_->AddOp(op_name, "Slice", {input_name}, y_names);
+  engine_->AddOpAttr(op_name, "axis", axis);
+  engine_->AddOpAttr<PTuple<int>>(op_name, "slice_point", slice_point);
+  // slice_dim is useless in anakin
+  engine_->AddOpAttr(op_name, "slice_dim", 4);
+}
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+REGISTER_ANAKIN_OP_CONVERTER(split, SplitOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/split.h b/paddle/fluid/inference/anakin/convert/split.h
new file mode 100644
index 0000000000000000000000000000000000000000..a4c6a14e62168ffaf5ff67b5cf953d477ff9e34d
--- /dev/null
+++ b/paddle/fluid/inference/anakin/convert/split.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+class SplitOpConverter : public AnakinOpConverter {
+ public:
+  SplitOpConverter() = default;
+
+  virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::Scope &scope,
+                          bool test_mode) override;
+  virtual ~SplitOpConverter() {}
+
+ private:
+};
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/sum.cc b/paddle/fluid/inference/anakin/convert/sum.cc
new file mode 100644
index 0000000000000000000000000000000000000000..df9104cf4631d86e0cbd87cb0e93a96d984953f5
--- /dev/null
+++ b/paddle/fluid/inference/anakin/convert/sum.cc
@@ -0,0 +1,54 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
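Anakin's Slice layer takes cumulative cut offsets (slice_point), while Paddle's split op carries per-output section lengths, so the converter above accumulates partial sums and drops the last entry. A standalone sketch of that conversion; the helper name is illustrative:

#include <cassert>
#include <vector>

// Converts Paddle's per-output "sections" into Anakin-style cumulative
// slice points; the final section is implied by the input extent, so the
// result has sections.size() - 1 entries.
std::vector<int> SectionsToSlicePoints(const std::vector<int>& sections) {
  std::vector<int> slice_points;
  int running = 0;
  for (size_t i = 0; i + 1 < sections.size(); ++i) {
    running += sections[i];
    slice_points.push_back(running);
  }
  return slice_points;
}

int main() {
  // sections {2, 3, 1} along an axis of extent 6 -> cut positions {2, 5}.
  assert((SectionsToSlicePoints({2, 3, 1}) == std::vector<int>{2, 5}));
  return 0;
}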
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/anakin/convert/sum.h"
+#include <algorithm>
+#include <map>
+#include <vector>
+
+using anakin::graph::GraphGlobalMem;
+using anakin::AK_FLOAT;
+using anakin::Precision;
+using anakin::saber::NV;
+using anakin::saber::X86;
+using anakin::saber::Shape;
+using anakin::PBlock;
+using anakin::PTuple;
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+void SumOpConverter::operator()(const framework::proto::OpDesc &op,
+                                const framework::Scope &scope, bool test_mode) {
+  framework::OpDesc op_desc(op, nullptr);
+  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 2);
+  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+
+  auto input_names = op_desc.Input("X");
+  auto out_name = op_desc.Output("Out").front();
+  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
+
+  std::vector<float> coeff = {1, 1};
+  std::string elementwise_type = "Add";
+  engine_->AddOp(op_name, "Eltwise", input_names, {out_name});
+  engine_->AddOpAttr<PTuple<float>>(op_name, "coeff", coeff);
+  engine_->AddOpAttr(op_name, "type", elementwise_type);
+}
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_ANAKIN_OP_CONVERTER(sum, SumOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/sum.h b/paddle/fluid/inference/anakin/convert/sum.h
new file mode 100644
index 0000000000000000000000000000000000000000..ddecc4b3bcb84f83af95e77399847f191c785563
--- /dev/null
+++ b/paddle/fluid/inference/anakin/convert/sum.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+class SumOpConverter : public AnakinOpConverter {
+ public:
+  SumOpConverter() = default;
+
+  virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::Scope &scope,
+                          bool test_mode) override;
+  virtual ~SumOpConverter() {}
+
+ private:
+};
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/test_activation_op.cc b/paddle/fluid/inference/anakin/convert/test_activation_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8bedd4a749a645829658291310347eeed1c0ea49
--- /dev/null
+++ b/paddle/fluid/inference/anakin/convert/test_activation_op.cc
@@ -0,0 +1,53 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
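The sum converter expresses the two-input sum as an Eltwise layer of type "Add" with coefficients {1, 1}. Assuming Eltwise computes a coefficient-weighted elementwise combination, this reduces to out[i] = x[i] + y[i], as this sketch (not part of the patch) illustrates:

#include <cassert>
#include <vector>

// Sketch of the assumed Eltwise "Add" semantics with per-input coefficients:
// out[i] = coeff[0] * a[i] + coeff[1] * b[i]. The converter fixes coeff to
// {1, 1}, which is exactly Paddle's two-input sum op.
std::vector<float> EltwiseAdd(const std::vector<float>& a,
                              const std::vector<float>& b,
                              const std::vector<float>& coeff) {
  std::vector<float> out(a.size());
  for (size_t i = 0; i < a.size(); ++i)
    out[i] = coeff[0] * a[i] + coeff[1] * b[i];
  return out;
}

int main() {
  auto out = EltwiseAdd({1.f, 2.f}, {3.f, 4.f}, {1.f, 1.f});
  assert(out[0] == 4.f && out[1] == 6.f);
  return 0;
}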
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/anakin/convert/activation.h" +#include "paddle/fluid/inference/anakin/convert/op_converter.h" +#include "paddle/fluid/inference/anakin/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace anakin { + +static void test_activation_op(const std::string &op_type) { + auto *converter = Registry::Global().Lookup(op_type); + PADDLE_ENFORCE(converter != nullptr); + std::unordered_set parameters; + framework::Scope scope; + AnakinConvertValidation validator(parameters, &scope); + validator.DeclInputVar("act-X", {10, 6, 1, 1}); + validator.DeclOutputVar("act-Out", {10, 6, 1, 1}); + framework::OpDesc desc; + desc.SetType(op_type); + desc.SetInput("X", {"act-X"}); + desc.SetOutput("Out", {"act-Out"}); + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + + validator.Execute(5); +} + +TEST(sigm_op, test) { test_activation_op("sigmoid"); } +TEST(tanh_op, test) { test_activation_op("tanh"); } +} // namespace anakin +} // namespace inference +} // namespace paddle + +USE_OP(sigmoid); +USE_OP(tanh); +USE_ANAKIN_CONVERTER(sigmoid); +USE_ANAKIN_CONVERTER(tanh); diff --git a/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc b/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..2832e1c8d167c646c9049beebc57a82fe416e62c --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc @@ -0,0 +1,71 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/fluid/inference/anakin/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace anakin { + +TEST(batch_norm_op, test) { + std::unordered_set parameters( + {"batch_norm_scale", "batch_norm_bias", "batch_norm_mean", + "batch_norm_variance"}); + framework::Scope scope; + AnakinConvertValidation validator(parameters, &scope); + std::vector param_shape{2}; + + validator.DeclInputVar("batch_norm_X", {1, 2, 5, 5}); + validator.DeclParamVar("batch_norm_scale", param_shape); + validator.DeclParamVar("batch_norm_bias", param_shape); + validator.DeclParamVar("batch_norm_mean", param_shape); + validator.DeclParamVar("batch_norm_variance", param_shape); + validator.DeclOutputVar("batch_norm_Y", {1, 2, 5, 5}); + validator.DeclOutputVar("batch_norm_save_mean", param_shape); + validator.DeclOutputVar("batch_norm_save_variance", param_shape); + + // Prepare Op description + framework::OpDesc desc; + + desc.SetType("batch_norm"); + desc.SetInput("X", {"batch_norm_X"}); + desc.SetInput("Scale", {"batch_norm_scale"}); + desc.SetInput("Bias", {"batch_norm_bias"}); + desc.SetInput("Mean", {"batch_norm_mean"}); + desc.SetInput("Variance", {"batch_norm_variance"}); + desc.SetOutput("Y", {"batch_norm_Y"}); + desc.SetOutput("MeanOut", {"batch_norm_mean"}); + desc.SetOutput("VarianceOut", {"batch_norm_variance"}); + desc.SetOutput("SavedMean", {"batch_norm_save_mean"}); + desc.SetOutput("SavedVariance", {"batch_norm_save_variance"}); + + float eps = 1e-5f; + bool is_test = true; + desc.SetAttr("epsilon", eps); + desc.SetAttr("is_test", is_test); + + validator.SetOp(*desc.Proto()); + + std::unordered_set neglected_output = { + "batch_norm_save_mean", "batch_norm_save_variance", "batch_norm_mean", + "batch_norm_variance"}; + validator.Execute(1, neglected_output); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle +USE_OP(batch_norm); +USE_ANAKIN_CONVERTER(batch_norm); diff --git a/paddle/fluid/inference/anakin/convert/test_concat_op.cc b/paddle/fluid/inference/anakin/convert/test_concat_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ecf44def5a2429360f0bcb92f00a0423e1d491cd --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/test_concat_op.cc @@ -0,0 +1,74 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/fluid/inference/anakin/convert/concat.h" +#include "paddle/fluid/inference/anakin/convert/op_converter.h" +#include "paddle/fluid/inference/anakin/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace anakin { + +TEST(concat_op, test) { + std::unordered_set parameters({""}); + framework::Scope scope; + AnakinConvertValidation validator(parameters, &scope); + validator.DeclInputVar("concat_x1", {1, 2, 1, 1}); + validator.DeclInputVar("concat_x2", {1, 3, 1, 1}); + validator.DeclInputVar("concat_x3", {1, 1, 1, 1}); + validator.DeclOutputVar("concat_out", {1, 6, 1, 1}); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("concat"); + desc.SetInput("X", {"concat_x1", "concat_x2", "concat_x3"}); + desc.SetOutput("Out", {"concat_out"}); + + int axis = 1; + desc.SetAttr("axis", axis); + + validator.SetOp(*desc.Proto()); + + validator.Execute(1); +} + +TEST(concat_op, test2) { + std::unordered_set parameters({""}); + framework::Scope scope; + AnakinConvertValidation validator(parameters, &scope); + validator.DeclInputVar("concat_x1", {1, 4}); + validator.DeclInputVar("concat_x2", {3, 4}); + validator.DeclInputVar("concat_x3", {2, 4}); + validator.DeclOutputVar("concat_out", {6, 4}); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("concat"); + desc.SetInput("X", {"concat_x1", "concat_x2", "concat_x3"}); + desc.SetOutput("Out", {"concat_out"}); + + int axis = 0; + desc.SetAttr("axis", axis); + + validator.SetOp(*desc.Proto()); + + validator.Execute(1); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle +USE_OP(concat); +USE_ANAKIN_CONVERTER(concat); diff --git a/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc b/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..6d93e50bc96b08b6ef7dd7c9d836038e335daae3 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc @@ -0,0 +1,62 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
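The expected output shapes in the two concat tests follow the usual rule: all dimensions except the concat axis must match, and the axis extents add up. A small sketch of that shape rule, with a hypothetical helper name:

#include <cassert>
#include <vector>

// Shape rule exercised by the concat tests above: all dims except `axis`
// must match, and the output's `axis` extent is the sum of the inputs'.
std::vector<int> ConcatShape(const std::vector<std::vector<int>>& ins,
                             int axis) {
  std::vector<int> out = ins.front();
  for (size_t i = 1; i < ins.size(); ++i) out[axis] += ins[i][axis];
  return out;
}

int main() {
  // axis = 1: {1,2,1,1} + {1,3,1,1} + {1,1,1,1} -> {1,6,1,1}
  assert((ConcatShape({{1, 2, 1, 1}, {1, 3, 1, 1}, {1, 1, 1, 1}}, 1) ==
          std::vector<int>{1, 6, 1, 1}));
  // axis = 0: {1,4} + {3,4} + {2,4} -> {6,4}
  assert((ConcatShape({{1, 4}, {3, 4}, {2, 4}}, 0) == std::vector<int>{6, 4}));
  return 0;
}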
*/ + +#include +#include "paddle/fluid/inference/anakin/convert/conv2d.h" +#include "paddle/fluid/inference/anakin/convert/op_converter.h" +#include "paddle/fluid/inference/anakin/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace anakin { + +TEST(conv2d_op, test) { + auto* conv2d_converter = + Registry::Global().Lookup("conv2d"); + ASSERT_TRUE(conv2d_converter != nullptr); + std::unordered_set parameters({"conv2d-Y"}); + framework::Scope scope; + AnakinConvertValidation validator(parameters, &scope); + validator.DeclInputVar("conv2d-X", {1, 3, 3, 3}); + validator.DeclParamVar("conv2d-Y", {4, 3, 1, 1}); + validator.DeclOutputVar("conv2d-Out", {1, 4, 3, 3}); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("conv2d"); + desc.SetInput("Input", {"conv2d-X"}); + desc.SetInput("Filter", {"conv2d-Y"}); + desc.SetOutput("Output", {"conv2d-Out"}); + + const std::vector strides({1, 1}); + const std::vector paddings({0, 0}); + const std::vector dilations({1, 1}); + const int groups = 1; + + desc.SetAttr("strides", strides); + desc.SetAttr("paddings", paddings); + desc.SetAttr("dilations", dilations); + desc.SetAttr("groups", groups); + + validator.SetOp(*desc.Proto()); + + validator.Execute(3); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +USE_OP(conv2d); +USE_ANAKIN_CONVERTER(conv2d); diff --git a/paddle/fluid/inference/anakin/convert/test_dropout_op.cc b/paddle/fluid/inference/anakin/convert/test_dropout_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..b2de5ae0a6e58eb25a4588571686a25500fe546c --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/test_dropout_op.cc @@ -0,0 +1,53 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
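The conv2d test's declared output {1, 4, 3, 3} follows from the standard convolution shape formula applied to a 3x3 spatial input with a 1x1 kernel, stride 1 and no padding; the channel count 4 comes from the filter's leading dimension in {4, 3, 1, 1}. A sketch of the arithmetic:

#include <cassert>

// Standard conv output-extent formula the test shapes rely on:
// out = (in + 2 * pad - dilation * (k - 1) - 1) / stride + 1
int ConvOutExtent(int in, int k, int pad, int stride, int dilation) {
  return (in + 2 * pad - dilation * (k - 1) - 1) / stride + 1;
}

int main() {
  // 3x3 input, 1x1 kernel, pad 0, stride 1, dilation 1 -> 3x3 output.
  assert(ConvOutExtent(3, 1, 0, 1, 1) == 3);
  return 0;
}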
*/ + +#include +#include "paddle/fluid/inference/anakin/convert/dropout.h" +#include "paddle/fluid/inference/anakin/convert/op_converter.h" +#include "paddle/fluid/inference/anakin/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace anakin { + +TEST(dropout_op, native) { + std::unordered_set parameters; + framework::Scope scope; + AnakinConvertValidation validator(parameters, &scope); + validator.DeclInputVar("x", {1, 1, 2, 2}); + validator.DeclOutputVar("out", {1, 1, 2, 2}); + validator.DeclOutputVar("mask", {1, 1, 2, 2}); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("dropout"); + desc.SetInput("X", {"x"}); + desc.SetOutput("Out", {"out"}); + desc.SetOutput("Mask", {"mask"}); + + float dropout_prob = 0.5; + desc.SetAttr("dropout_prob", dropout_prob); + desc.SetAttr("is_test", true); + + validator.SetOp(*desc.Proto()); + std::unordered_set neglected_output = {"mask"}; + validator.Execute(1, neglected_output); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +USE_OP(dropout); +USE_ANAKIN_CONVERTER(dropout); diff --git a/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc b/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3a437f5fdb565609667b7a862c9b2bb13cdbeded --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc @@ -0,0 +1,56 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/fluid/inference/anakin/convert/elementwise.h" +#include "paddle/fluid/inference/anakin/convert/op_converter.h" +#include "paddle/fluid/inference/anakin/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace anakin { + +static void test_elementwise_op(const std::string &op_type) { + std::unordered_set parameters; + framework::Scope scope; + AnakinConvertValidation validator(parameters, &scope); + validator.DeclInputVar("x", {1, 1, 2, 2}); + validator.DeclInputVar("y", {1, 1, 2, 2}); + validator.DeclOutputVar("out", {1, 1, 2, 2}); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType(op_type); + desc.SetInput("X", {"x"}); + desc.SetInput("Y", {"y"}); + desc.SetOutput("Out", {"out"}); + + int axis = -1; + desc.SetAttr("axis", axis); + + validator.SetOp(*desc.Proto()); + validator.Execute(1); +} + +TEST(elementwise_op, native_add) { test_elementwise_op("elementwise_add"); } +TEST(elementwise_op, native_mul) { test_elementwise_op("elementwise_mul"); } + +} // namespace anakin +} // namespace inference +} // namespace paddle + +USE_OP(elementwise_add); +USE_ANAKIN_CONVERTER(elementwise_add); +USE_OP(elementwise_mul); +USE_ANAKIN_CONVERTER(elementwise_mul); diff --git a/paddle/fluid/inference/anakin/convert/test_fc_op.cc b/paddle/fluid/inference/anakin/convert/test_fc_op.cc index 7b8ceefe28873f0ffb9cedbb04b832ba029b7de4..ee6d1dc291fe3733ff2e9f66dd453120fa266a55 100644 --- a/paddle/fluid/inference/anakin/convert/test_fc_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_fc_op.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include -#include "paddle/fluid/inference/anakin/convert/fc.h" #include "paddle/fluid/inference/anakin/convert/op_converter.h" #include "paddle/fluid/inference/anakin/convert/ut_helper.h" @@ -22,17 +21,15 @@ namespace inference { namespace anakin { TEST(fc_op, test) { - auto fc_converter = OpRegister::instance()->Get("fc"); - ASSERT_TRUE(fc_converter != nullptr); - // Registrar register_fc("fc"); - // auto fc = std::make_shared(); + auto* fc_converter = Registry::Global().Lookup("fc"); + ASSERT_TRUE(fc_converter); std::unordered_set parameters({"mul_y"}); framework::Scope scope; - AnakinConvertValidation validator(parameters, scope); - validator.DeclInputVar("mul_x", {1, 1, 1, 1}); - validator.DeclParamVar("mul_y", {1, 2}); - validator.DeclOutputVar("mul_out", {1, 1, 1, 2}); + AnakinConvertValidation validator(parameters, &scope); + validator.DeclInputVar("mul_x", {1, 1, 2, 2}); + validator.DeclParamVar("mul_y", {4, 2}); + validator.DeclOutputVar("mul_out", {1, 2}); // Prepare Op description framework::OpDesc desc; @@ -40,8 +37,6 @@ TEST(fc_op, test) { desc.SetInput("X", {"mul_x"}); desc.SetInput("Y", {"mul_y"}); desc.SetOutput("Out", {"mul_out"}); - int num_flatten_dims = 3; - desc.SetAttr("x_num_col_dims", num_flatten_dims); validator.SetOp(*desc.Proto()); validator.Execute(10); @@ -52,3 +47,4 @@ TEST(fc_op, test) { } // namespace paddle USE_OP(mul); +USE_ANAKIN_CONVERTER(fc); diff --git a/paddle/fluid/inference/anakin/convert/test_flatten_op.cc b/paddle/fluid/inference/anakin/convert/test_flatten_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d13281f11f03fdd75e585bce8b30e8780d81f7d7 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/test_flatten_op.cc @@ -0,0 +1,51 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/anakin/convert/op_converter.h" +#include "paddle/fluid/inference/anakin/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace anakin { + +TEST(flatten_op, test) { + auto *converter = Registry::Global().Lookup("flatten"); + ASSERT_TRUE(converter); + + std::unordered_set parameters; + framework::Scope scope; + AnakinConvertValidation validator(parameters, &scope); + validator.DeclInputVar("flatten-X", {3, 10, 10, 4}); + validator.DeclOutputVar("flatten-Out", {3, 400, 1, 1}); + framework::OpDesc desc; + desc.SetType("flatten"); + desc.SetInput("X", {"flatten-X"}); + desc.SetOutput("Out", {"flatten-Out"}); + desc.SetAttr("axis", 1); + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + + validator.Execute(5); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +USE_OP(reshape); +USE_OP_ITSELF(flatten); +USE_ANAKIN_CONVERTER(flatten); diff --git a/paddle/fluid/inference/anakin/convert/test_im2sequence_op.cc b/paddle/fluid/inference/anakin/convert/test_im2sequence_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..5e5764633125c867e27b0b52e0e6ef18714653b2 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/test_im2sequence_op.cc @@ -0,0 +1,55 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
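With axis = 1, flatten keeps the leading dimension and collapses the rest, so {3, 10, 10, 4} becomes {3, 400}; the test above declares {3, 400, 1, 1} because the Anakin validator works with 4-D shapes. A sketch of the collapse, with an illustrative helper name:

#include <cassert>
#include <vector>

// flatten with `axis`: dims [0, axis) multiply into the first output
// extent, dims [axis, rank) into the second.
std::vector<int> FlattenShape(const std::vector<int>& in, int axis) {
  int outer = 1, inner = 1;
  for (int i = 0; i < static_cast<int>(in.size()); ++i)
    (i < axis ? outer : inner) *= in[i];
  return {outer, inner};
}

int main() {
  assert((FlattenShape({3, 10, 10, 4}, 1) == std::vector<int>{3, 400}));
  return 0;
}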
*/ + +#include +#include "paddle/fluid/inference/anakin/convert/im2sequence.h" +#include "paddle/fluid/inference/anakin/convert/op_converter.h" +#include "paddle/fluid/inference/anakin/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace anakin { + +TEST(im2sequence_op, native) { + std::unordered_set parameters; + framework::Scope scope; + AnakinConvertValidation validator(parameters, &scope); + + std::vector kernels = {6, 1}; + std::vector strides = {1, 1}; + std::vector paddings = {0, 0, 0, 0}; + + validator.DeclInputVar("x", {1, 1, 2, 2}); + validator.DeclOutputVar("out", {1, 1 * kernels[0] * kernels[1]}); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("im2sequence"); + desc.SetInput("X", {"x"}); + desc.SetOutput("Out", {"out"}); + + desc.SetAttr("kernels", kernels); + desc.SetAttr("strides", strides); + desc.SetAttr("paddings", paddings); + + validator.SetOp(*desc.Proto()); + validator.Execute(1); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +USE_OP(im2sequence); +USE_ANAKIN_CONVERTER(im2sequence); diff --git a/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc b/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..1ac019467721605c539c7ada452d04d5134fa341 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc @@ -0,0 +1,119 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ +#include +#include "paddle/fluid/inference/anakin/convert/op_converter.h" +#include "paddle/fluid/inference/anakin/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace anakin { + +void test_pool2d(bool global_pooling, bool ceil_mode, + std::string pool_type = "max") { + auto* pool2d_converter = + Registry::Global().Lookup("pool2d"); + ASSERT_TRUE(pool2d_converter); + + framework::Scope scope; + std::unordered_set parameters; + AnakinConvertValidation validator(parameters, &scope); + + // The ITensor's Dims should not contain the batch size. + // So, the ITensor's Dims of input and output should be C * H * W. 
+ validator.DeclInputVar("pool2d_x", {1, 3, 6, 7}); + if (global_pooling) + validator.DeclOutputVar("pool2d_out", {1, 3, 1, 1}); + else if (ceil_mode) + validator.DeclOutputVar("pool2d_out", {1, 3, 3, 4}); + else + validator.DeclOutputVar("pool2d_out", {1, 3, 3, 3}); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("pool2d"); + desc.SetInput("X", {"pool2d_x"}); + desc.SetOutput("Out", {"pool2d_out"}); + + std::vector ksize({2, 2}); + std::vector strides({2, 2}); + std::vector paddings({0, 0}); + std::string pooling_t = pool_type; + + desc.SetAttr("pooling_type", pooling_t); + desc.SetAttr("ksize", ksize); + desc.SetAttr("strides", strides); + desc.SetAttr("paddings", paddings); + desc.SetAttr("global_pooling", global_pooling); + desc.SetAttr("ceil_mode", ceil_mode); + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + + validator.Execute(1); +} + +void test_pool2d2(bool global_pooling, bool ceil_mode, + std::string pool_type = "max") { + auto* pool2d_converter = + Registry::Global().Lookup("pool2d"); + ASSERT_TRUE(pool2d_converter); + + framework::Scope scope; + std::unordered_set parameters; + AnakinConvertValidation validator(parameters, &scope); + + // The ITensor's Dims should not contain the batch size. + // So, the ITensor's Dims of input and output should be C * H * W. + validator.DeclInputVar("pool2d_x", {1, 1, 17, 17}); + validator.DeclOutputVar("pool2d_out", {1, 1, 17, 17}); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("pool2d"); + desc.SetInput("X", {"pool2d_x"}); + desc.SetOutput("Out", {"pool2d_out"}); + + std::vector ksize({3, 3}); + std::vector strides({1, 1}); + std::vector paddings({1, 1}); + std::string pooling_t = pool_type; + + desc.SetAttr("pooling_type", pooling_t); + desc.SetAttr("ksize", ksize); + desc.SetAttr("strides", strides); + desc.SetAttr("paddings", paddings); + desc.SetAttr("global_pooling", global_pooling); + desc.SetAttr("ceil_mode", true); + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + + validator.Execute(1); +} + +TEST(Pool2dOpConverter, normal) { test_pool2d(false, false); } +TEST(Pool2dOpConverter, test_global_pooling) { test_pool2d(true, false); } + +TEST(Pool2dOpConverter, max_ceil_test) { test_pool2d(false, true); } +TEST(Pool2dOpConverter, avg_ceil_test) { test_pool2d(false, true, "avg"); } +TEST(Pool2dOpConverter, avg_ceil_test2) { test_pool2d2(false, true, "avg"); } + +} // namespace anakin +} // namespace inference +} // namespace paddle + +USE_OP(pool2d); +USE_ANAKIN_CONVERTER(pool2d); diff --git a/paddle/fluid/inference/anakin/convert/test_relu_op.cc b/paddle/fluid/inference/anakin/convert/test_relu_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..04e624518a5a4477bbb41475b575f85be5a120d4 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/test_relu_op.cc @@ -0,0 +1,50 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
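The three expected pool2d shapes above come from the floor and ceil variants of the pooling shape formula applied to a 6x7 input with 2x2 windows and stride 2. A sketch of the computation, assuming the standard formula:

#include <cassert>

// Pooling output extent: floor or ceil of (in + 2 * pad - ksize) / stride,
// plus one.
int PoolOutExtent(int in, int ksize, int pad, int stride, bool ceil_mode) {
  int num = in + 2 * pad - ksize;
  return (ceil_mode ? (num + stride - 1) / stride : num / stride) + 1;
}

int main() {
  // Input H x W = 6 x 7, ksize 2, stride 2, pad 0:
  assert(PoolOutExtent(6, 2, 0, 2, false) == 3);  // floor -> 3
  assert(PoolOutExtent(7, 2, 0, 2, false) == 3);  // floor -> 3 => {1,3,3,3}
  assert(PoolOutExtent(7, 2, 0, 2, true) == 4);   // ceil  -> 4 => {1,3,3,4}
  return 0;
}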
*/ + +#include +#include "paddle/fluid/inference/anakin/convert/op_converter.h" +#include "paddle/fluid/inference/anakin/convert/relu.h" +#include "paddle/fluid/inference/anakin/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace anakin { + +static void test_activation_op(const std::string &op_type) { + auto *converter = Registry::Global().Lookup(op_type); + PADDLE_ENFORCE(converter != nullptr); + std::unordered_set parameters; + framework::Scope scope; + AnakinConvertValidation validator(parameters, &scope); + validator.DeclInputVar("act-X", {10, 6, 1, 1}); + validator.DeclOutputVar("act-Out", {10, 6, 1, 1}); + framework::OpDesc desc; + desc.SetType(op_type); + desc.SetInput("X", {"act-X"}); + desc.SetOutput("Out", {"act-Out"}); + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + + validator.Execute(5); +} + +TEST(sigm_op, test) { test_activation_op("relu"); } +} // namespace anakin +} // namespace inference +} // namespace paddle + +USE_OP(relu); +USE_ANAKIN_CONVERTER(relu); diff --git a/paddle/fluid/inference/anakin/convert/test_reshape_op.cc b/paddle/fluid/inference/anakin/convert/test_reshape_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..306ebf510f29a87ca1ffa6df86e08f86b3f8ffbb --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/test_reshape_op.cc @@ -0,0 +1,74 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/fluid/inference/anakin/convert/op_converter.h" +#include "paddle/fluid/inference/anakin/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace anakin { + +TEST(reshape, test) { + auto* converter = Registry::Global().Lookup("reshape"); + ASSERT_TRUE(converter); + framework::Scope scope; + std::unordered_set parameters; + AnakinConvertValidation validator(parameters, &scope); + + // validator.DeclInputVar("reshape-X", {2, 3, 3, 1}); + // validator.DeclOutputVar("reshape-Out", {3, 2, 1, 3}); + validator.DeclInputVar("reshape-X", {1, 2, 4, 1}); + validator.DeclOutputVar("reshape-Out", {1, 8, 1, 1}); + + framework::OpDesc desc; + desc.SetType("reshape"); + desc.SetInput("X", {"reshape-X"}); + desc.SetOutput("Out", {"reshape-Out"}); + // desc.SetAttr("shape", std::vector({3, 2, 1, 3})); + desc.SetAttr("shape", std::vector({1, 8, 1, 1})); + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + validator.Execute(1); +} + +TEST(reshape, test2) { + framework::Scope scope; + std::unordered_set parameters; + AnakinConvertValidation validator(parameters, &scope); + + validator.DeclInputVar("reshape-X", {1, 2, 4}); + validator.DeclOutputVar("reshape-Out", {1, 4, 2}); + + framework::OpDesc desc; + desc.SetType("reshape"); + desc.SetInput("X", {"reshape-X"}); + desc.SetOutput("Out", {"reshape-Out"}); + // desc.SetAttr("shape", std::vector({3, 2, 1, 3})); + desc.SetAttr("shape", std::vector({0, -1, 2})); + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + validator.Execute(1); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +USE_OP(reshape); +USE_ANAKIN_CONVERTER(reshape); diff --git a/paddle/fluid/inference/anakin/convert/test_softmax_op.cc b/paddle/fluid/inference/anakin/convert/test_softmax_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..8c14fae0a67b9e488cf072535868a34f6195ab71 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/test_softmax_op.cc @@ -0,0 +1,49 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
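The second reshape test leans on Paddle's shape-attribute conventions: 0 copies the input dimension at that position, and a single -1 is inferred so the element count is preserved; {1, 2, 4} with shape {0, -1, 2} therefore resolves to {1, 4, 2}. A sketch of that resolution, with an illustrative helper name:

#include <cassert>
#include <vector>

// Resolves a Paddle-style reshape spec: 0 keeps the input dim at that
// position, a single -1 is inferred from the remaining element count.
std::vector<int> ResolveShape(const std::vector<int>& in,
                              std::vector<int> spec) {
  int total = 1, known = 1, infer_at = -1;
  for (int d : in) total *= d;
  for (size_t i = 0; i < spec.size(); ++i) {
    if (spec[i] == 0) spec[i] = in[i];
    if (spec[i] == -1)
      infer_at = static_cast<int>(i);
    else
      known *= spec[i];
  }
  if (infer_at >= 0) spec[infer_at] = total / known;
  return spec;
}

int main() {
  assert((ResolveShape({1, 2, 4}, {0, -1, 2}) == std::vector<int>{1, 4, 2}));
  return 0;
}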
*/ + +#include +#include "paddle/fluid/inference/anakin/convert/op_converter.h" +#include "paddle/fluid/inference/anakin/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace anakin { + +TEST(softmax, test) { + auto* converter = Registry::Global().Lookup("softmax"); + ASSERT_TRUE(converter); + framework::Scope scope; + std::unordered_set parameters; + AnakinConvertValidation validator(parameters, &scope); + + validator.DeclInputVar("softmax-X", {1, 10, 2}); + validator.DeclOutputVar("softmax-Out", {1, 10, 2}); + + framework::OpDesc desc; + desc.SetType("softmax"); + desc.SetInput("X", {"softmax-X"}); + desc.SetOutput("Out", {"softmax-Out"}); + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + validator.Execute(1); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +USE_OP(softmax); +USE_ANAKIN_CONVERTER(softmax); diff --git a/paddle/fluid/inference/anakin/convert/test_split_op.cc b/paddle/fluid/inference/anakin/convert/test_split_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..aa61c01a511c2337944aadbbc3d47893487de683 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/test_split_op.cc @@ -0,0 +1,110 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
+#include "paddle/fluid/inference/anakin/convert/split.h"
+#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+template <int Axis>
+void AnakinSliceTest(const std::vector<int> &in_shape,
+                     const std::vector<int> &sections) {
+  std::unordered_set<std::string> parameters({""});
+  framework::Scope scope;
+  AnakinConvertValidation validator(parameters, &scope);
+
+  validator.DeclInputVar("split_input", in_shape);
+  std::vector<std::string> output_vars;
+  for (size_t i = 0; i < sections.size(); ++i) {
+    auto out_shape = in_shape;
+    out_shape[Axis] = sections[i];
+    std::string output_name = "split_out" + std::to_string(i);
+    validator.DeclOutputVar(output_name, out_shape);
+    output_vars.push_back(output_name);
+  }
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("split");
+  desc.SetInput("X", {"split_input"});
+  desc.SetOutput("Out", output_vars);
+
+  desc.SetAttr("axis", Axis);
+  desc.SetAttr("num", 0);
+  desc.SetAttr("sections", sections);
+
+  validator.SetOp(*desc.Proto());
+
+  validator.Execute(1);
+}
+
+// batch = 0, axis = 1, same shape
+TEST(split_op, test_same_shape_axis1_batch1) {
+  AnakinSliceTest<1>({1, 4, 2, 2}, {2, 2});
+}
+// batch = 0, axis = 1, different shape
+TEST(split_op, test_different_shape_axis1_batch1) {
+  AnakinSliceTest<1>({1, 3, 2, 2}, {2, 1});
+}
+// batch = 10, axis = 1, same shape
+TEST(split_op, test_same_shape_axis1_batch10) {
+  AnakinSliceTest<1>({1, 4, 2, 2}, {2, 2});
+}
+// batch = 10, axis = 1, different shape
+TEST(split_op, test_different_shape_axis1_batch10) {
+  AnakinSliceTest<1>({1, 3, 2, 2}, {2, 1});
+}
+// batch = 0, axis = 2, same shape
+TEST(split_op, test_same_shape_axis2_batch1) {
+  AnakinSliceTest<2>({1, 3, 4, 2}, {2, 2});
+}
+// batch = 0, axis = 2, different shape
+TEST(split_op, test_different_shape_axis2_batch1) {
+  AnakinSliceTest<2>({1, 3, 3, 2}, {2, 1});
+}
+// batch = 10, axis = 2, same shape
+TEST(split_op, test_same_shape_axis2_batch10) {
+  AnakinSliceTest<2>({1, 3, 4, 2}, {2, 2});
+}
+// batch = 10, axis = 2, different shape
+TEST(split_op, test_different_shape_axis2_batch10) {
+  AnakinSliceTest<2>({1, 3, 3, 2}, {2, 1});
+}
+// batch = 0, axis = 3, same shape
+TEST(split_op, test_same_shape_axis3_batch1) {
+  AnakinSliceTest<3>({1, 3, 2, 4}, {2, 2});
+}
+// batch = 0, axis = 3, different shape
+TEST(split_op, test_different_shape_axis3_batch1) {
+  AnakinSliceTest<3>({1, 3, 2, 3}, {2, 1});
+}
+// batch = 10, axis = 3, same shape
+TEST(split_op, test_same_shape_axis3_batch10) {
+  AnakinSliceTest<3>({1, 3, 2, 4}, {2, 2});
+}
+// batch = 10, axis = 3, different shape
+TEST(split_op, test_different_shape_axis3_batch10) {
+  AnakinSliceTest<3>({1, 3, 2, 3}, {2, 1});
+}
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+
+USE_OP(split);
+USE_ANAKIN_CONVERTER(split);
diff --git a/paddle/fluid/inference/anakin/convert/test_sum_op.cc b/paddle/fluid/inference/anakin/convert/test_sum_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d6a59a0166be9239b480221cc076069239403429
--- /dev/null
+++ b/paddle/fluid/inference/anakin/convert/test_sum_op.cc
@@ -0,0 +1,48 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/anakin/convert/op_converter.h" +#include "paddle/fluid/inference/anakin/convert/sum.h" +#include "paddle/fluid/inference/anakin/convert/ut_helper.h" +#include "paddle/fluid/operators/sum_op.h" + +namespace paddle { +namespace inference { +namespace anakin { + +TEST(sum, native) { + std::unordered_set parameters; + framework::Scope scope; + AnakinConvertValidation validator(parameters, &scope); + validator.DeclInputVar("sum_x1", {1, 2, 1, 2}); + validator.DeclInputVar("sum_x2", {1, 2, 1, 2}); + validator.DeclOutputVar("sum_out", {1, 2, 1, 2}); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("sum"); + desc.SetInput("X", {"sum_x1", "sum_x2"}); + desc.SetOutput("Out", {"sum_out"}); + + validator.SetOp(*desc.Proto()); + validator.Execute(1); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +USE_OP(sum); +USE_ANAKIN_CONVERTER(sum); diff --git a/paddle/fluid/inference/anakin/convert/test_transpose_op.cc b/paddle/fluid/inference/anakin/convert/test_transpose_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..016ed26f02f782fe5032d8368f7767a5c94dfe9f --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/test_transpose_op.cc @@ -0,0 +1,73 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/fluid/inference/anakin/convert/op_converter.h" +#include "paddle/fluid/inference/anakin/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace anakin { + +TEST(transpose_op, test) { + auto* converter = Registry::Global().Lookup("transpose"); + ASSERT_TRUE(converter != nullptr); + std::unordered_set parameters; + framework::Scope scope; + AnakinConvertValidation validator(parameters, &scope); + validator.DeclInputVar("transpose-X", {2, 3, 4, 5}); + validator.DeclOutputVar("transpose-Out", {4, 2, 5, 3}); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("transpose"); + desc.SetInput("X", {"transpose-X"}); + desc.SetOutput("Out", {"transpose-Out"}); + desc.SetAttr("axis", std::vector({2, 0, 3, 1})); + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + + validator.Execute(3); +} + +// test input shape's dims < 4 +TEST(transpose_op, test2) { + std::unordered_set parameters; + framework::Scope scope; + AnakinConvertValidation validator(parameters, &scope); + validator.DeclInputVar("transpose-X", {3, 4, 5}); + validator.DeclOutputVar("transpose-Out", {3, 5, 4}); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("transpose"); + desc.SetInput("X", {"transpose-X"}); + desc.SetOutput("Out", {"transpose-Out"}); + desc.SetAttr("axis", std::vector({0, 2, 1})); + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + + validator.Execute(1); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +USE_OP(transpose); +USE_ANAKIN_CONVERTER(transpose); diff --git a/paddle/fluid/inference/anakin/convert/transpose.cc b/paddle/fluid/inference/anakin/convert/transpose.cc new file mode 100644 index 0000000000000000000000000000000000000000..6a887401034f9d8c0b8b6aa3eeffb6579e395029 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/transpose.cc @@ -0,0 +1,55 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "paddle/fluid/inference/anakin/convert/transpose.h"
+#include <algorithm>
+#include <map>
+#include <vector>
+
+using anakin::graph::GraphGlobalMem;
+using anakin::AK_FLOAT;
+using anakin::saber::NV;
+using anakin::saber::Shape;
+using anakin::PTuple;
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+void TransposeOpConverter::operator()(const framework::proto::OpDesc &op,
+                                      const framework::Scope &scope,
+                                      bool test_mode) {
+  framework::OpDesc op_desc(op, nullptr);
+  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
+  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+
+  auto input = op_desc.Input("X").front();
+  auto output = op_desc.Output("Out").front();
+  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
+  engine_->AddOp(op_name, "Permute", {input}, {output});
+
+  auto axis = boost::get<std::vector<int>>(op_desc.GetAttr("axis"));
+  size_t axis_size = axis.size();
+  while (axis.size() < 4) {
+    axis.push_back(axis_size);
+    axis_size += 1;
+  }
+  engine_->AddOpAttr<PTuple<int>>(op_name, "dims", axis);
+}
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_ANAKIN_OP_CONVERTER(transpose, TransposeOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/transpose.h b/paddle/fluid/inference/anakin/convert/transpose.h
new file mode 100644
index 0000000000000000000000000000000000000000..62d26b6a9cc9885682f5750df32018596f014b33
--- /dev/null
+++ b/paddle/fluid/inference/anakin/convert/transpose.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+class TransposeOpConverter : public AnakinOpConverter {
+ public:
+  TransposeOpConverter() = default;
+
+  virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::Scope &scope,
+                          bool test_mode) override;
+  virtual ~TransposeOpConverter() {}
+};
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/ut_helper.h b/paddle/fluid/inference/anakin/convert/ut_helper.h
index 38d8e596a738ac98c9f9870473f72dcc72b0e7aa..e0371d95347a521f499dd9454d284907b3048a04 100644
--- a/paddle/fluid/inference/anakin/convert/ut_helper.h
+++ b/paddle/fluid/inference/anakin/convert/ut_helper.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <gtest/gtest.h>
 #include <map>
 #include <memory>
 #include <string>
@@ -24,6 +25,7 @@ limitations under the License.
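The transpose converter pads a rank-k permutation up to Anakin's 4-D layout by appending the remaining axis indices in order, which leaves the padded trailing dimensions in place. A standalone sketch of the same loop; the helper name is illustrative:

#include <cassert>
#include <vector>

// Mirrors the padding loop in the converter above: a rank-k permutation is
// extended to rank 4 by appending the missing axes in order, so the padded
// trailing dimensions are not permuted.
std::vector<int> PadPermutationTo4D(std::vector<int> axis) {
  size_t next = axis.size();
  while (axis.size() < 4) axis.push_back(static_cast<int>(next++));
  return axis;
}

int main() {
  assert((PadPermutationTo4D({0, 2, 1}) == std::vector<int>{0, 2, 1, 3}));
  assert((PadPermutationTo4D({2, 0, 3, 1}) ==
          std::vector<int>{2, 0, 3, 1}));  // already 4-D: unchanged
  return 0;
}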
*/ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/inference/anakin/convert/op_converter.h" #include "paddle/fluid/inference/anakin/engine.h" #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/utils/singleton.h" @@ -82,7 +84,7 @@ class AnakinConvertValidation { AnakinConvertValidation() = delete; AnakinConvertValidation(const std::unordered_set& parameters, - const framework::Scope& scope) + framework::Scope* scope) : parameters_(parameters), scope_(scope), place_(0) { PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); engine_.reset(new AnakinEngine(true)); @@ -106,7 +108,7 @@ class AnakinConvertValidation { void DeclVar(const std::string& name, const std::vector dim_vec) { platform::CUDADeviceContext ctx(place_); - auto* x = scope_.Var(name); + auto* x = scope_->Var(name); auto* x_tensor = x->GetMutable(); x_tensor->Resize(framework::make_ddim(dim_vec)); RandomizeTensor(x_tensor, place_, ctx); @@ -118,15 +120,22 @@ class AnakinConvertValidation { // should init anakin engine here. Singleton::Global().ConvertOp( - desc, parameters_, scope_, engine_.get(), true /*test_mode*/); + desc, parameters_, *scope_, engine_.get(), true /*test_mode*/); engine_->Freeze(); + + std::map> temp_max_input_shape; for (const auto& input : op_desc_->InputArgumentNames()) { if (parameters_.count(input)) continue; - auto& t = inference::analysis::GetFromScope(scope_, + auto& t = inference::analysis::GetFromScope(*scope_, input); auto t_shape = framework::vectorize2int(t.dims()); + while (t_shape.size() < 4) { + t_shape.push_back(1); + } engine_->SetInputShape(input, t_shape); + temp_max_input_shape[input] = t_shape; } + engine_->SetMaxInputShape(temp_max_input_shape); engine_->Optimize(); engine_->InitGraph(); } @@ -138,14 +147,14 @@ class AnakinConvertValidation { std::unordered_set neglected_output = {}) { // Execute Fluid Op platform::CUDADeviceContext ctx(place_); - op_->Run(scope_, place_); + op_->Run(*scope_, place_); // std::vector input_vector; // std::vector output_vector; std::map inputs; for (const auto& input : op_desc_->InputArgumentNames()) { if (parameters_.count(input)) continue; - auto* var = scope_.FindVar(input); + auto* var = scope_->FindVar(input); auto tensor = var->GetMutable(); inputs.insert({input, tensor}); } @@ -155,45 +164,38 @@ class AnakinConvertValidation { for (const auto& output : op_desc_->OutputArgumentNames()) { if (neglected_output.count(output)) continue; std::vector fluid_out; - auto* var = scope_.FindVar(output); + auto* var = scope_->FindVar(output); auto tensor = var->GetMutable(); framework::TensorToVector(*tensor, ctx, &fluid_out); fluid_outputs.push_back(fluid_out); - // size_t fluid_out_size = fluid_out.size(); - /*for (size_t i = 0; i < fluid_out_size; i++) { - std::cout << fluid_out[i] << std::endl; - }*/ outputs.insert({output, tensor}); } - engine_->Execute(inputs, outputs); + engine_->Execute(inputs, outputs, stream_); int i_output = 0; for (const auto& output : op_desc_->OutputArgumentNames()) { if (neglected_output.count(output)) continue; std::vector anakin_out; - auto* var = scope_.FindVar(output); + auto* var = scope_->FindVar(output); auto tensor = var->GetMutable(); framework::TensorToVector(*tensor, ctx, &anakin_out); size_t anakin_out_size = anakin_out.size(); auto fluid_out = fluid_outputs[i_output++]; for (size_t i = 0; i < anakin_out_size; i++) { - LOG(INFO) << "Output[" << i << "]: anakin[" << 
anakin_out[i] << "], " - << "fluid[" << fluid_out[i] << "]"; + EXPECT_LT(std::abs(fluid_out[i] - anakin_out[i]), 1e-3); } } } - framework::Scope& scope() { return scope_; } - private: std::unique_ptr engine_{nullptr}; cudaStream_t stream_; std::unique_ptr op_; std::unique_ptr op_desc_; const std::unordered_set& parameters_; - framework::Scope& scope_; + framework::Scope* scope_; platform::CUDAPlace place_; }; diff --git a/paddle/fluid/inference/anakin/engine.cc b/paddle/fluid/inference/anakin/engine.cc index 6549991474f4834f0c3ef74c60d294cca6bebc91..ccf78ad7e56306d24af829c45c888021f4e3fbc4 100644 --- a/paddle/fluid/inference/anakin/engine.cc +++ b/paddle/fluid/inference/anakin/engine.cc @@ -33,9 +33,15 @@ namespace inference { namespace anakin { template -AnakinEngine::AnakinEngine(bool need_summary) +AnakinEngine::AnakinEngine( + bool need_summary, int device, int max_batch_size, + std::map> max_input_shape) : graph_(new AnakinGraphT()), - net_(new AnakinNetT(need_summary)) {} + net_(new AnakinNetT(need_summary)) { + device_ = device; + max_batch_size_ = max_batch_size; + max_input_shape_ = max_input_shape; +} template AnakinEngine::~AnakinEngine() {} @@ -63,34 +69,53 @@ void AnakinEngine::AddOp( template void AnakinEngine::Execute( const std::map &inputs, - const std::map &outputs) { + const std::map &outputs, + cudaStream_t stream) { + cudaDeviceSynchronize(); for (const auto &input : inputs) { auto *tensor = input.second; auto *data = tensor->data(); - auto shape = framework::vectorize2int(tensor->dims()); - ::anakin::saber::Shape anakin_shape(shape); + + auto fluid_input_shape = framework::vectorize2int(tensor->dims()); + while (fluid_input_shape.size() < 4) { + fluid_input_shape.push_back(1); + } auto *anakin_input = net_->get_in(input.first); + std::vector max_input_shape = max_input_shape_[input.first]; + int max_shape_sum = + std::accumulate(max_input_shape.begin(), max_input_shape.end(), 1, + std::multiplies()); + + PADDLE_ENFORCE(max_shape_sum >= tensor->numel(), + "The anakin input max shape should be greater than" + " or equal to the real input shape, Please set the max " + "input shape using EnableAnakinEngine"); + anakin_input->reshape(fluid_input_shape); + ::anakin::saber::Tensor tmp_anakin_tensor(data, TargetT(), 0, - anakin_shape); - anakin_input->share_from(tmp_anakin_tensor); + fluid_input_shape); + anakin_input->copy_from(tmp_anakin_tensor); } - + net_->prediction(); + cudaDeviceSynchronize(); for (const auto &output : outputs) { + platform::CUDAPlace gpu_place(device_); auto *tensor = output.second; - auto *data = tensor->data(); - auto shape = framework::vectorize2int(tensor->dims()); - ::anakin::saber::Shape anakin_shape(shape); auto *anakin_output = net_->get_out(output.first); - ::anakin::saber::Tensor tmp_anakin_tensor(data, TargetT(), 0, - anakin_shape); - anakin_output->share_from(tmp_anakin_tensor); + auto *anakin_data = anakin_output->data(); + auto anakin_output_shape = anakin_output->valid_shape(); + tensor->Resize(framework::make_ddim(anakin_output_shape)); + auto *fluid_data = tensor->mutable_data(gpu_place); + memory::Copy(gpu_place, static_cast(fluid_data), gpu_place, + static_cast(anakin_data), + tensor->numel() * sizeof(float), stream); } - net_->prediction(); + cudaDeviceSynchronize(); } template void AnakinEngine::Freeze() { - PADDLE_ENFORCE(graph_->Freeze(), "Freeze anakin subgraph."); + PADDLE_ENFORCE(graph_->Freeze_v3(), "Freeze anakin subgraph."); } template diff --git a/paddle/fluid/inference/anakin/engine.h 
b/paddle/fluid/inference/anakin/engine.h index d8f32f57be5aabb91ba720c6457a03f15083db43..4845ffdf5b9dcfa99d1f421d47328beb4b196298 100644 --- a/paddle/fluid/inference/anakin/engine.h +++ b/paddle/fluid/inference/anakin/engine.h @@ -15,9 +15,11 @@ #pragma once #include +#include #include #include #include +#include #include #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/inference/engine.h" @@ -26,8 +28,12 @@ #include "framework/core/net/net.h" #include "framework/core/types.h" #include "framework/graph/graph.h" +#include "framework/graph/graph_global_mem.h" #include "saber/saber_types.h" +using anakin::Precision; +using anakin::saber::NV; + namespace anakin { template @@ -46,8 +52,13 @@ namespace anakin { template class AnakinEngine { + using NetT = ::anakin::Net; + using GraphT = ::anakin::graph::Graph; + public: - explicit AnakinEngine(bool need_summary = false); + explicit AnakinEngine( + bool need_summary = false, int device = 0, int max_batch_size = 1, + std::map> max_input_shape = {}); ~AnakinEngine(); void InitGraph(); void SetInputShape(const std::string &name, std::vector shape); @@ -61,20 +72,72 @@ class AnakinEngine { PADDLE_ENFORCE(graph_->AddOpAttr(op_name, attr_name, attr_value), "Add operation's attribution."); } - + NetT *Net() { return net_.get(); } + GraphT *Graph() { return graph_.get(); } std::unique_ptr Clone(); + const std::map> &GetMaxInputShape() { + return max_input_shape_; + } + void SetMaxInputShape(std::map> shape) { + max_input_shape_ = shape; + } + int GetMaxBatchSize() { return max_batch_size_; } void Freeze(); void Optimize(); + void AllocTmpMem() { + PADDLE_ENFORCE(net_->alloc_memory_first(*graph_), + "anakin alloc temp memory first failed"); + } + void Save(std::string path) { graph_->save(path); } + + bool IsInit() { return initialized_; } + int GetDevice() { return device_; } void Execute(const std::map &inputs, - const std::map &outputs); + const std::map &outputs, + cudaStream_t stream); private: - using NetT = ::anakin::Net; - using GraphT = ::anakin::graph::Graph; + bool initialized_{false}; + int max_batch_size_; + std::map> max_input_shape_; + int device_; std::unique_ptr graph_; std::unique_ptr net_; }; +class AnakinEngineManager { + using AnakinNvEngineT = AnakinEngine; + + public: + bool HasEngine(const std::string &name) const { + if (engines_.count(name) == 0) return false; + return engines_.at(name).get() != nullptr; + } + AnakinNvEngineT *Get(const std::string &name) const { + return engines_.at(name).get(); + } + + AnakinNvEngineT *Create( + bool need_summary, int device, int max_batch_size, + std::map> max_input_shape, + std::string engine_name) { + std::unique_lock lk(mut_); + auto *p = new AnakinEngine( + need_summary, device, max_batch_size, max_input_shape); + engines_[engine_name].reset(p); + return p; + } + + void DeleteALL() { + for (auto &item : engines_) { + item.second.reset(nullptr); + } + } + + private: + std::unordered_map> engines_; + std::mutex mut_; +}; } // namespace anakin } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/anakin/op_teller.cc b/paddle/fluid/inference/anakin/op_teller.cc new file mode 100644 index 0000000000000000000000000000000000000000..90cf021de2f9d365fd1fa21f7d189d3fcd9d3ab2 --- /dev/null +++ b/paddle/fluid/inference/anakin/op_teller.cc @@ -0,0 +1,67 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
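The engine's Execute() (previous hunk) checks that the buffer sized for the configured max input shape can hold each real input before reshaping and copying it in. A sketch of that capacity check, assuming shapes are plain per-dimension extents; the helper name is illustrative:

#include <cassert>
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

// Sketch of the capacity check Execute() performs: the element capacity of
// the configured max input shape must be at least numel() of the actual
// input; otherwise the caller must raise the max shape (the error message
// points at EnableAnakinEngine).
bool FitsMaxShape(const std::vector<int>& max_shape, int64_t numel) {
  int64_t capacity = std::accumulate(max_shape.begin(), max_shape.end(),
                                     int64_t{1}, std::multiplies<int64_t>());
  return capacity >= numel;
}

int main() {
  assert(FitsMaxShape({1, 3, 224, 224}, 1 * 3 * 112 * 112));
  assert(!FitsMaxShape({1, 3, 224, 224}, 2 * 3 * 224 * 224));
  return 0;
}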
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/op_teller.h" + +namespace paddle { +namespace inference { +namespace anakin { + +// Just tell by the op_types. +struct SimpleOpTypeSetTeller : public Teller { + SimpleOpTypeSetTeller() { + teller_set.insert("mul"); + teller_set.insert("fc"); + teller_set.insert("conv2d_fusion"); + teller_set.insert("split"); + teller_set.insert("relu"); + teller_set.insert("pool2d"); + teller_set.insert("elementwise_add"); + teller_set.insert("elementwise_mul"); + teller_set.insert("concat"); + teller_set.insert("tanh"); + teller_set.insert("conv2d"); + teller_set.insert("batch_norm"); + teller_set.insert("softmax"); + teller_set.insert("flatten2"); + teller_set.insert("reshape2"); + teller_set.insert("transpose2"); + teller_set.insert("density_prior_box"); + teller_set.insert("detection_out"); + teller_set.insert("dropout"); + teller_set.insert("sigmoid"); + teller_set.insert("sum"); + } + + bool operator()(const std::string& op_type, + const framework::OpDesc& desc) override { + return teller_set.count(op_type); + } + + private: + std::unordered_set teller_set; +}; + +bool OpTeller::Tell(const std::string& op_type, const framework::OpDesc& desc) { + for (auto& teller : tellers_) { + if ((*teller)(op_type, desc)) return true; + } + return false; +} + +OpTeller::OpTeller() { tellers_.emplace_back(new SimpleOpTypeSetTeller); } + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/op_teller.h b/paddle/fluid/inference/anakin/op_teller.h new file mode 100644 index 0000000000000000000000000000000000000000..15a42067b8438e60851a50e454abde95782d90ee --- /dev/null +++ b/paddle/fluid/inference/anakin/op_teller.h @@ -0,0 +1,70 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include "paddle/fluid/framework/op_desc.h" + +namespace paddle { +namespace inference { +namespace anakin { + +/* + * Single Op teller definition. + * One can override this and define a more complex tell logic, considering more + * issues such as op_desc.
+ */ +struct Teller { + virtual bool operator()(const std::string& op_type, + const framework::OpDesc& desc) = 0; + + virtual ~Teller() = default; +}; +/* + * A real example: + * + * struct SomeTeller : public Teller { + * bool operator()(const std::string& op_type, + * const framework::OpDesc& desc) override { + * return op_type == "fc" && desc.Inputs().size() == 2; + * } + *}; + */ + +/* + * class OpTeller helps to tell whether a fluid + * operator can be transformed to an Anakin layer. + */ +class OpTeller { + public: + static OpTeller& Global() { + static std::unique_ptr x(new OpTeller); + return *x; + } + + bool Tell(const std::string& op_type, const framework::OpDesc& desc); + + private: + OpTeller(); + + private: + std::vector> tellers_; +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/test_anakin_engine.cc b/paddle/fluid/inference/anakin/test_anakin_engine.cc index 571294d3e22fb9489686bfcb2f3a64198099f970..8fd6b8bec9ada6dec67fd24a2457713203431ebf 100644 --- a/paddle/fluid/inference/anakin/test_anakin_engine.cc +++ b/paddle/fluid/inference/anakin/test_anakin_engine.cc @@ -17,9 +17,6 @@ limitations under the License. */ #include -#include "framework/core/net/net.h" -#include "framework/graph/graph.h" -#include "framework/graph/graph_global_mem.h" #include "paddle/fluid/inference/anakin/engine.h" using anakin::graph::GraphGlobalMem; @@ -84,7 +81,9 @@ TEST_F(TestAnakinEngine, Execute) { auto *y_data = y.mutable_data(platform::CUDAPlace()); std::map outputs = {{"y", &y}}; - engine_->Execute(inputs, outputs); + cudaStream_t stream; + + engine_->Execute(inputs, outputs, stream); auto *y_data_gpu = y_data; float y_data_cpu[2]; cudaMemcpy(y_data_cpu, y_data_gpu, sizeof(float) * 2, cudaMemcpyDeviceToHost); diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 2f31b182af7293488719e41a92b2ea78709bda02..29f16943e0c13fbe080e8e073b081583f1d14d11 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -23,8 +23,14 @@ #pragma once +#include +#include #include +#include +#include +#include #include + #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" @@ -34,8 +40,14 @@ namespace paddle { namespace inference { namespace analysis { + using framework::ir::Graph; +#ifdef PADDLE_WITH_MKLDNN +using VarQuantScale = + std::unordered_map>; +#endif + /* * The argument definition of both Pass and PassManagers. * @@ -47,6 +59,8 @@ struct Argument { using unique_ptr_t = std::unique_ptr>; using fusion_statis_t = std::unordered_map; + using engine_opt_info_t = std::map; + using anakin_max_shape_t = std::map>; bool Has(const std::string& key) const { return valid_fields_.count(key); } @@ -99,12 +113,14 @@ struct Argument { private: \ unique_ptr_t field__##_; + DECL_ARGUMENT_FIELD(predictor_id, PredictorID, int); // Model path DECL_ARGUMENT_FIELD(model_dir, ModelDir, std::string); // Model specified with program and parameters files. DECL_ARGUMENT_FIELD(model_program_path, ModelProgramPath, std::string); DECL_ARGUMENT_FIELD(model_params_path, ModelParamsPath, std::string); DECL_ARGUMENT_FIELD(model_from_memory, ModelFromMemory, bool); + DECL_ARGUMENT_FIELD(engine_opt_info, EngineOptInfo, engine_opt_info_t); // The overall graph to work on.
DECL_ARGUMENT_UNIQUE_FIELD(main_graph, MainGraph, framework::ir::Graph); @@ -124,6 +140,19 @@ struct Argument { DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes, std::unordered_set); +#ifdef PADDLE_WITH_MKLDNN + // A set of op types to enable their quantized kernels + DECL_ARGUMENT_FIELD(quantize_enabled_op_types, QuantizeEnabledOpTypes, + std::unordered_set); + + // A set of op IDs to exclude from enabling their quantized kernels + DECL_ARGUMENT_FIELD(quantize_excluded_op_ids, QuantizeExcludedOpIds, + std::unordered_set); + + // Scales for variables to be quantized + DECL_ARGUMENT_FIELD(quant_var_scales, QuantVarScales, VarQuantScale); +#endif + // Passed from config. DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool); DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int); @@ -133,6 +162,13 @@ struct Argument { DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int); DECL_ARGUMENT_FIELD(tensorrt_precision_mode, TensorRtPrecisionMode, AnalysisConfig::Precision); + DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, TensorRtUseStaticEngine, + bool); + + DECL_ARGUMENT_FIELD(anakin_max_input_shape, AnakinMaxInputShape, + anakin_max_shape_t); + DECL_ARGUMENT_FIELD(anakin_max_batch_size, AnakinMaxBatchSize, int); + DECL_ARGUMENT_FIELD(use_anakin, UseAnakin, bool); // Memory optimized related. DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool); diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index 59107f28080dceb0a58e17d42281db5f3773de56..a48058400241b030f17557156a4d973fca92fd8d 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -17,10 +17,12 @@ limitations under the License. */ #include #include #include +#include #include #include #include #include +#include #include #include "paddle/fluid/framework/framework.pb.h" @@ -217,6 +219,35 @@ static std::string GetTrtCalibTableData(const std::string &model_opt_cache_dir, return ""; } +static std::string GetTrtEngineSerializedPath(const std::string &model_root, + const std::string &engine_key) { + return model_root + "/trt_serialized_" + engine_key; +} + +static std::string GetTrtEngineSerializedData( + const std::string &model_opt_cache_dir, const std::string &engine_key) { + std::string trt_serialized_path = + GetTrtEngineSerializedPath(model_opt_cache_dir, engine_key); + if (FileExists(trt_serialized_path)) { + VLOG(3) << "Trt serialized file: " << trt_serialized_path + << " is found here"; + std::ifstream infile(trt_serialized_path, std::ios::in); + std::stringstream buffer; + buffer << infile.rdbuf(); + std::string trt_engine_serialized_data(buffer.str()); + return trt_engine_serialized_data; + } + return ""; +} + +static void SaveTrtEngineSerializedDataToFile( + const std::string &trt_serialized_path, + const std::string &engine_serialized_data) { + std::ofstream outfile(trt_serialized_path); + outfile << engine_serialized_data; + outfile.close(); +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 8d5ee36ae627deccd7ddbd4bf8c5354a82c5e9db..7a96ac11d8ef754f38070862a70744947412882b 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -13,7 +13,12 @@ // limitations under the License.
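The three helpers added to helper.h above form a small on-disk cache keyed by the engine hash. A minimal sketch (not part of the patch) of the round trip they provide; the directory and key values below are hypothetical:

```cpp
// Sketch only: exercises the cache helpers declared in helper.h above.
#include <string>
#include "paddle/fluid/inference/analysis/helper.h"

void CacheRoundTrip(const std::string &engine_blob) {
  using namespace paddle::inference::analysis;  // NOLINT
  const std::string cache_dir = "./model_opt_cache";      // hypothetical dir
  const std::string engine_key = "8402852757097972103";   // hash-style key

  // Writes the blob to <cache_dir>/trt_serialized_<engine_key>.
  SaveTrtEngineSerializedDataToFile(
      GetTrtEngineSerializedPath(cache_dir, engine_key), engine_blob);

  // Reads it back; returns "" when nothing has been cached yet.
  std::string cached = GetTrtEngineSerializedData(cache_dir, engine_key);
  (void)cached;
}
```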
#include "paddle/fluid/inference/analysis/ir_pass_manager.h" +#include +#include #include +#include +#include +#include #include #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" @@ -55,14 +60,23 @@ void IRPassManager::CreatePasses(Argument *argument, ".dot"; pass->Set("graph_viz_path", new std::string(std::move(dot_file_path))); pass_num++; - } - if (pass_name == "mkldnn_placement_pass") { + } else if (pass_name == "mkldnn_placement_pass") { pass->Set("mkldnn_enabled_op_types", new std::unordered_set( argument->mkldnn_enabled_op_types())); - } - - if (pass_name == "tensorrt_subgraph_pass") { +#ifdef PADDLE_WITH_MKLDNN + } else if (pass_name == "cpu_quantize_placement_pass") { + pass->Set("quantize_enabled_op_types", + new std::unordered_set( + argument->quantize_enabled_op_types())); + pass->Set( + "quantize_excluded_op_ids", + new std::unordered_set(argument->quantize_excluded_op_ids())); + } else if (pass_name == "cpu_quantize_pass") { + pass->Set("quant_var_scales", + new VarQuantScale(argument->quant_var_scales())); +#endif + } else if (pass_name == "tensorrt_subgraph_pass") { pass->Set("workspace_size", new int(argument->tensorrt_workspace_size())); pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size())); pass->Set("min_subgraph_size", @@ -74,13 +88,40 @@ void IRPassManager::CreatePasses(Argument *argument, AnalysisConfig::Precision::kInt8; pass->Set("enable_int8", new bool(enable_int8)); - std::string model_opt_cache_dir = - argument->Has("model_dir") - ? argument->model_dir() - : GetDirRoot(argument->model_program_path()); - pass->Set( - "model_opt_cache_dir", - new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir))); + + bool use_static_engine = argument->tensorrt_use_static_engine(); + bool model_from_memory = argument->model_from_memory(); + bool int8_valid = !(model_from_memory && enable_int8); + PADDLE_ENFORCE(int8_valid, + "TRT INT8 Now don't support model load from memory."); + + if ((!model_from_memory && use_static_engine) || enable_int8) { + std::string model_opt_cache_dir = + argument->Has("model_dir") + ? 
argument->model_dir() + : GetDirRoot(argument->model_program_path()); + pass->Set( + "model_opt_cache_dir", + new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir))); + } + pass->Set("gpu_device_id", new int(argument->gpu_device_id())); + pass->Set("use_static_engine", new bool(use_static_engine)); + pass->Set("model_from_memory", new bool(argument->model_from_memory())); + pass->Set("engine_opt_info", new std::map( + argument->engine_opt_info())); + } + + if (pass_name == "anakin_subgraph_pass") { + pass->Set("program", + new framework::ProgramDesc *(&argument->main_program())); + pass->Set("gpu_device_id", new int(argument->gpu_device_id())); + pass->Set("model_from_memory", new bool(argument->model_from_memory())); + pass->Set("engine_opt_info", new std::map( + argument->engine_opt_info())); + pass->Set("predictor_id", new int(argument->predictor_id())); + pass->Set("max_input_shape", new std::map>( + argument->anakin_max_input_shape())); + pass->Set("max_batch_size", new int(argument->anakin_max_batch_size())); } pre_pass = pass_name; diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.h b/paddle/fluid/inference/analysis/ir_pass_manager.h index 2a595cb36b8345157b3fd26afc62aabfa98b87bc..2d120679eedd392d78b4da66276297ff7280792b 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.h +++ b/paddle/fluid/inference/analysis/ir_pass_manager.h @@ -22,7 +22,10 @@ #pragma once +#include #include +#include +#include #include #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/pass.h" diff --git a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt index 410a90132aa7657a23b858570763547fe53730a0..05a3d7ddfdb08c98866cc0a08ec4113866c7567d 100644 --- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc) +cc_library(subgraph_detector SRCS subgraph_detector.cc subgraph_util.cc DEPS proto_desc) if(WITH_TESTING) add_dependencies(subgraph_detector gtest) endif() @@ -14,3 +14,15 @@ if (WITH_GPU AND TENSORRT_FOUND) file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n") set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "") endif() + +if (ANAKIN_FOUND) + cc_library(anakin_subgraph_pass SRCS anakin_subgraph_pass.cc DEPS subgraph_detector anakin_op_teller) + + set(analysis_deps ${analysis_deps} + subgraph_detector anakin_subgraph_pass + CACHE INTERNAL "") + + set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h) + file(APPEND ${pass_file} "USE_PASS(anakin_subgraph_pass);\n") + set(INFER_IR_PASSES ${INFER_IR_PASSES} anakin_subgraph_pass CACHE INTERNAL "") +endif() diff --git a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..12deed2533bba713701849d58f8c5cf3269b85da --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc @@ -0,0 +1,219 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/inference/anakin/convert/op_converter.h" +#include "paddle/fluid/inference/anakin/op_teller.h" +#include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h" +#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace inference { +namespace analysis { + +using framework::ir::Node; + +std::unique_ptr analysis::AnakinSubgraphPass::ApplyImpl( + std::unique_ptr graph) const { + framework::ir::FusePassBase::Init("anakin_subgraph_pass", graph.get()); + + auto teller = [](const framework::ir::Node *node) { + if (!node->IsOp() || !node->Op()) return false; + return anakin::OpTeller::Global().Tell(node->Op()->Type(), *node->Op()); + }; + + SubGraphFuser fuser(graph.get(), teller, 6 /* min_subgraph_size */); + fuser(); + + std::vector graph_param_names = + ExtractParameters(graph->Nodes()); + + // Those parameters already exist in anakin, and should not have another copy + // in fluid. + std::vector repetitive_params; + + for (auto *node : graph->Nodes()) { + if (node->IsOp() && !Agent(node).subgraph()->empty()) { + CreateAnakinOp(node, graph.get(), graph_param_names, &repetitive_params); + std::unordered_set nodes2remove( + Agent(node).subgraph()->begin(), Agent(node).subgraph()->end()); + framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove); + } + } + + std::unordered_set nodes2remove; + for (auto *node : graph->Nodes()) { + if (node->IsOp() && Agent(node).deleted()) { + nodes2remove.insert(node); + } + } + framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove); + graph->Set(framework::ir::kRepetitiveParamAttr, + new std::vector(repetitive_params)); + + return graph; +} + +std::string GenerateAnakinEngineKey(const std::set &engine_inputs, + const std::set &engine_outputs, + std::string id) { + std::string engine_hash_key = ""; + for (auto name : engine_inputs) { + engine_hash_key += name; + } + for (auto name : engine_outputs) { + engine_hash_key += name; + } + engine_hash_key += id; + auto engine_key = std::to_string(std::hash()(engine_hash_key)); + return engine_key; +} + +void AnakinSubgraphPass::CreateAnakinOp( + framework::ir::Node *node, Graph *graph, + const std::vector &graph_params, + std::vector *repetitive_params) const { + auto *op_desc = node->Op(); + auto &subgraph = *Agent(node).subgraph(); + PADDLE_ENFORCE(!subgraph.empty()); + + framework::ProgramDesc *program_desc = + Get("program"); + // Add a new block for the AnakinEngineOp. + const framework::BlockDesc &main_block = + program_desc->Block(framework::kRootBlockIndex); + // const framework::BlockDesc& main_block = program_desc->Block(0); + framework::BlockDesc *new_block = program_desc->AppendBlock(main_block); + + // A fake block desc.
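+ // It only carries copies of the subgraph's op descs so that they can be renamed below and then handed to the Anakin op converter.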
+ framework::proto::BlockDesc block_proto; + framework::BlockDesc block_desc(nullptr, &block_proto); + block_desc.Proto()->set_parent_idx(-1); + block_desc.Proto()->set_idx(0); + string::PrettyLogDetail("--- detect a sub-graph with %d nodes", + subgraph.size()); + + for (auto *node : subgraph) { + auto *new_block_op = new_block->AppendOp(); + auto *op = block_desc.AppendOp(); + *new_block_op->Proto() = *node->Op()->Proto(); + *op->Proto() = *node->Op()->Proto(); + } + + // Then, we will use the input_names_with_id and output_names_with_id to + // generate the engine key. + // So, we use set instead of unordered_set here to ensure that the engine key + // is unique. + std::set input_names; + std::set input_names_with_id; + std::vector params; + for (auto *x : node->inputs) { + input_names.insert(x->Name()); + input_names_with_id.insert(x->Name() + std::to_string(x->id())); + if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) { + params.push_back(x->Name()); + } + } + std::copy(params.begin(), params.end(), + std::back_inserter(*repetitive_params)); + op_desc->SetInput( + "Xs", std::vector(input_names.begin(), input_names.end())); + + std::set output_names; + std::set output_names_with_id; + for (auto *x : node->outputs) { + output_names.insert(x->Name()); + output_names_with_id.insert(x->Name() + std::to_string(x->id())); + } + + op_desc->SetOutput( + "Ys", std::vector(output_names.begin(), output_names.end())); + op_desc->SetType("anakin_engine"); + + std::unordered_map output_name_map; + auto &subgraph_nodes = *Agent(node).subgraph(); + + // The following procedure is used to rename all the intermediate + // variables and the output variables of the subgraph. + RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id, + &output_names_with_id, &output_names, &output_name_map, + false); + + // When the anakin engine runs at the end of the operation, + // output_mapping helps us copy the data from the renamed output tensors + // back to the fluid Tensors.
+ std::vector output_mapping; + for (auto name : output_names) { + PADDLE_ENFORCE(output_name_map.count(name) != 0); + output_mapping.push_back(output_name_map[name]); + } + + auto *vars = block_desc.Proto()->mutable_vars(); + for (framework::ir::Node *node : graph->Nodes()) { + if (node->IsVar() && node->Var()) { + *vars->Add() = *node->Var()->Proto(); + } + } + + PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(), + "the block has no var-desc"); + PADDLE_ENFORCE(!output_mapping.empty()); + op_desc->SetBlockAttr("sub_block", new_block); + SetAttr(op_desc->Proto(), "subgraph", + block_desc.Proto()->SerializeAsString()); + // Set attrs + SetAttr(op_desc->Proto(), "parameters", params); + SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping); + int predictor_id = Get("predictor_id"); + auto engine_key = GenerateAnakinEngineKey( + input_names_with_id, output_names_with_id, std::to_string(predictor_id)); + + SetAttr(op_desc->Proto(), "engine_key", engine_key); + auto max_input_shape = + Get>>("max_input_shape"); + auto max_batch_size = Get("max_batch_size"); + + auto *anakin_engine = + inference::Singleton::Global().Create( + true, Get("gpu_device_id"), max_batch_size, max_input_shape, + engine_key); + + auto *scope = param_scope(); + std::unordered_set param_set(params.begin(), params.end()); + framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto()); + + inference::Singleton::Global() + .ConvertBlockToAnakinEngine( + &block_desc_temp, scope, + std::vector(input_names.begin(), input_names.end()), + param_set, output_mapping, anakin_engine); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle + +REGISTER_PASS(anakin_subgraph_pass, + paddle::inference::analysis::AnakinSubgraphPass); diff --git a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..c13b9ecda42336a79187185070104ba9ac4b67bc --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h @@ -0,0 +1,44 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
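GenerateAnakinEngineKey above simply concatenates the ordered engine input names, the output names, and the predictor id, then hashes the result. A self-contained sketch of the same scheme, with illustrative names:

```cpp
// Sketch of the engine-key scheme used by GenerateAnakinEngineKey above.
#include <functional>
#include <iostream>
#include <set>
#include <string>

std::string MakeEngineKey(const std::set<std::string> &inputs,
                          const std::set<std::string> &outputs,
                          const std::string &predictor_id) {
  std::string hash_key;
  // std::set iterates in sorted order, so the key is deterministic.
  for (const auto &name : inputs) hash_key += name;
  for (const auto &name : outputs) hash_key += name;
  hash_key += predictor_id;
  return std::to_string(std::hash<std::string>()(hash_key));
}

int main() {
  // Identical I/O but different predictor ids yield different keys, so two
  // predictors never collide in the global engine manager.
  std::cout << MakeEngineKey({"x", "y"}, {"z"}, "0") << "\n";
  std::cout << MakeEngineKey({"x", "y"}, {"z"}, "1") << "\n";
}
```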
+ +#pragma once +#include +#include +#include +#include +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/inference/anakin/engine.h" +#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h" + +using anakin::Precision; +using anakin::saber::NV; +namespace paddle { +namespace inference { +namespace analysis { + +class AnakinSubgraphPass : public framework::ir::FusePassBase { + public: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; + + private: + void CreateAnakinOp(framework::ir::Node *x, framework::ir::Graph *graph, + const std::vector &graph_params, + std::vector *repetitive_params) const; + void CleanIntermediateOutputs(framework::ir::Node *node); +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc index 96befe7f8a5d16402338ac337daa96d714b4d310..76b1671601eec95d64b36effc5727481dcd070e2 100644 --- a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc @@ -14,6 +14,8 @@ limitations under the License. */ #include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h" #include +#include +#include #include #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" @@ -418,7 +420,7 @@ void SubGraphFuser::ReplaceNodesWithSubGraphs() { // Node that contains this subgraph 2. Mark the nodes inside the sub-graph // as deleted. 3. Replace the deleted node with the new Block Node. framework::OpDesc empty_desc; - empty_desc.SetType("tensorrt_engine"); + empty_desc.SetType("anakin_engine"); auto *block_node = graph_->CreateOpNode(&empty_desc); Agent(block_node).set_subgraph({}); auto io = ExtractInputAndOutputOfSubGraph(subgraph); diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc new file mode 100644 index 0000000000000000000000000000000000000000..a17ee1b707a7f950cddc62373a9a57c793d5528f --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc @@ -0,0 +1,152 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file defines the utilities to partition a graph. + */ + +#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h" +#include +#include + +namespace paddle { +namespace inference { +namespace analysis { +using framework::ir::Node; + +std::vector ExtractParameters( + const std::unordered_set &nodes) { + // We can judge whether a variable is a parameter by + // its persistable property, but sometimes the persistable property + // of the feed op output is also true, so we have to identify it.
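+ // The feed/fetch outputs gathered below are used as an exclusion list when picking the persistable vars.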
+ std::vector feed_outputs; + for (const auto &node : nodes) { + if (!node->IsOp()) continue; + std::string op_type = node->Op()->Type(); + if (op_type == "feed" || op_type == "fetch") { + std::vector output_names = node->Op()->OutputArgumentNames(); + std::copy(output_names.begin(), output_names.end(), + std::back_inserter(feed_outputs)); + } + } + + std::vector parameters; + for (const auto &node : nodes) { + if (!node->IsVar()) continue; + if (node->Var()->Persistable() && + std::find(feed_outputs.begin(), feed_outputs.end(), node->Name()) == + feed_outputs.end()) { + parameters.push_back(node->Name()); + } + } + return parameters; +} + +void RenameAndGetOutputs( + const std::vector &subgraph_nodes, + framework::BlockDesc *block_desc, + const std::set &input_names_with_id, + std::set *output_names_with_id, + std::set *output_names, + std::unordered_map *output_name_map, + bool is_trt) { + //// In the normal case, paddle-trt has a bug when running GoogleNet. + // When there are two or more 1 * 1 convolutions that share the same input, + // paddle-tensorrt will do a merging optimization that fuses those convs + // into one conv, which then triggers the bug. So, we use a strategy to avoid + // this optimization for the time being. This bug will be fixed in the future. + std::unordered_map + same_hierarchy_conv2d_num_map; + + for (size_t index = 0; index < block_desc->OpSize(); ++index) { + framework::proto::OpDesc *op = block_desc->Op(index)->Proto(); + framework::OpDesc op_desc(*op, nullptr); + auto correspond_node = subgraph_nodes[index]; + PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type()); + + std::unordered_map var2id; + std::unordered_map in_vars; + for (auto *in_var : correspond_node->inputs) { + var2id[in_var->Name()] = in_var->id(); + in_vars[in_var->Name()] = in_var; + } + // rename for the input variables of op inside subgraph + for (int i = 0; i < op->inputs_size(); i++) { + // one input + auto *in_var = op->mutable_inputs(i); + std::vector replaced_names; + for (int k = 0; k < in_var->arguments_size(); k++) { // all the arguments + std::string arg_value = in_var->arguments(k); + std::string arg_value_with_id = + arg_value + std::to_string(var2id[arg_value]); + if (input_names_with_id.count(arg_value_with_id)) { + replaced_names.push_back(arg_value); + } else { + replaced_names.push_back(arg_value_with_id); + } + } + in_var->clear_arguments(); + for (size_t k = 0; k < replaced_names.size(); k++) { + in_var->add_arguments(replaced_names[k]); + } + } + var2id.clear(); + for (auto out_var : correspond_node->outputs) { + var2id[out_var->Name()] = out_var->id(); + } + + if (op_desc.Type() == "conv2d" && is_trt) { + auto input_var_name = op_desc.Input("Input").front(); + auto filter_var_name = op_desc.Input("Filter").front(); + auto out_var_name = op_desc.Output("Output").front(); + auto filter_shape = in_vars[filter_var_name]->Var()->GetShape(); + const std::vector strides = + boost::get>(op_desc.GetAttr("strides")); + const std::vector paddings = + boost::get>(op_desc.GetAttr("paddings")); + if (same_hierarchy_conv2d_num_map[input_var_name] > 0) { + (*output_names_with_id) + .insert(out_var_name + std::to_string(var2id[out_var_name])); + (*output_names).insert(out_var_name); + } else if (filter_shape[2] == 1 && filter_shape[3] == 1 && + strides[0] == 1 && strides[1] == 1 && paddings[0] == 0 && + paddings[1] == 0) { + same_hierarchy_conv2d_num_map[input_var_name] += 1; + } + } + + // rename for the output variables of op inside subgraph + for (int i = 0; i < 
op->outputs_size(); i++) { + framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i); + std::vector replaced_names; + for (int k = 0; k < out_var->arguments_size(); k++) { + std::string arg_value = out_var->arguments(k); + std::string arg_value_with_id = + arg_value + std::to_string(var2id[arg_value]); + if (output_names_with_id->count(arg_value_with_id)) { + (*output_name_map)[arg_value] = arg_value_with_id; + } + replaced_names.push_back(arg_value_with_id); + } + out_var->clear_arguments(); + for (size_t k = 0; k < replaced_names.size(); k++) { + out_var->add_arguments(replaced_names[k]); + } + } + } +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h new file mode 100644 index 0000000000000000000000000000000000000000..3cf21bf5f426a7142626e6ae1db6ee478418d08a --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file defines the utilities to partition a graph. + */ + +#pragma once +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_traits.h" +#include "paddle/fluid/framework/ir/node.h" + +namespace paddle { +namespace inference { +namespace analysis { +using framework::ir::Node; + +std::vector ExtractParameters( + const std::unordered_set &nodes); + +void RenameAndGetOutputs( + const std::vector &subgraph_nodes, + framework::BlockDesc *block_desc, + const std::set &input_names_with_id, + std::set *output_names_with_id, + std::set *output_names, + std::unordered_map *output_name_map, + bool is_trt = true); + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 69a9caec030600332c9f11ba255e4e642bd41e96..59399403276b59c143fc3e06a53643e0a85cf559 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -13,14 +13,15 @@ // limitations under the License.
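The renaming rule implemented by RenameAndGetOutputs is: every argument inside the sub-graph becomes `name + node id`, except names that are engine inputs, which keep their original spelling. A self-contained sketch of that rule, with illustrative names:

```cpp
// Sketch of the renaming rule in RenameAndGetOutputs above.
#include <iostream>
#include <set>
#include <string>

std::string RenameArg(const std::string &arg, int node_id,
                      const std::set<std::string> &input_names_with_id) {
  const std::string arg_with_id = arg + std::to_string(node_id);
  // Engine inputs keep their original names; every other argument gets the
  // unique "name + node id" spelling so a variable can be both an op input
  // and an op output inside the sub-graph without aliasing.
  return input_names_with_id.count(arg_with_id) ? arg : arg_with_id;
}

int main() {
  const std::set<std::string> engine_inputs = {"x3"};  // "x" with node id 3
  std::cout << RenameArg("x", 3, engine_inputs) << "\n";         // -> x
  std::cout << RenameArg("conv_out", 7, engine_inputs) << "\n";  // -> conv_out7
}
```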
#include +#include #include -#include -#include #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h" #include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/op_teller.h" #include "paddle/fluid/string/pretty_log.h" @@ -30,11 +31,7 @@ namespace analysis { using framework::ir::Node; -std::vector ExtractParameters( - const std::unordered_set &nodes); - std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl( - std::unique_ptr graph) const { framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get()); @@ -47,9 +44,16 @@ std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl( Get("min_subgraph_size") /*min subgraph size*/); fuser(); + std::vector graph_param_names = + ExtractParameters(graph->Nodes()); + // Those parameters already exist in trt, and should not have another copy in + // fluid. + std::vector repetitive_params; + for (auto *node : graph->Nodes()) { if (node->IsOp() && !Agent(node).subgraph()->empty()) { - CreateTensorRTOp(node, graph.get()); + CreateTensorRTOp(node, graph.get(), graph_param_names, + &repetitive_params); std::unordered_set nodes2remove( Agent(node).subgraph()->begin(), Agent(node).subgraph()->end()); @@ -64,12 +68,15 @@ std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl( } } framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove); + graph->Set(framework::ir::kRepetitiveParamAttr, + new std::vector(repetitive_params)); return graph; } std::string GenerateEngineKey(const std::set &engine_inputs, - const std::set &engine_outputs) { + const std::set &engine_outputs, + const std::string &predictor_id) { std::string engine_hash_key = ""; for (auto name : engine_inputs) { engine_hash_key += name; @@ -77,12 +84,15 @@ std::string GenerateEngineKey(const std::set &engine_inputs, for (auto name : engine_outputs) { engine_hash_key += name; } + engine_hash_key += predictor_id; auto engine_key = std::to_string(std::hash()(engine_hash_key)); return engine_key; } -void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, - Graph *graph) const { +void TensorRtSubgraphPass::CreateTensorRTOp( + framework::ir::Node *node, Graph *graph, + const std::vector &graph_params, + std::vector *repetitive_params) const { auto *op_desc = node->Op(); auto &subgraph = *Agent(node).subgraph(); PADDLE_ENFORCE(!subgraph.empty()); @@ -116,12 +126,16 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, // is unique. std::set input_names; std::set input_names_with_id; + std::vector params; + + // The node->inputs contains both input tensors and parameters.
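+ // Persistable inputs are split out into `params` below; they are baked into the engine rather than fed at run time.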
for (auto *x : node->inputs) { input_names.insert(x->Name()); input_names_with_id.insert(x->Name() + std::to_string(x->id())); + if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) { + params.push_back(x->Name()); + } } - op_desc->SetInput( - "Xs", std::vector(input_names.begin(), input_names.end())); std::set output_names; std::set output_names_with_id; @@ -130,11 +144,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, output_names_with_id.insert(x->Name() + std::to_string(x->id())); } - op_desc->SetOutput( - "Ys", std::vector(output_names.begin(), output_names.end())); - op_desc->SetType("tensorrt_engine"); - std::unordered_map output_name_map; + auto &subgraph_nodes = *Agent(node).subgraph(); // The following procedure is used to rename all the intermediate // variables and the output variables of the subgraph. @@ -148,61 +159,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, // input of a OP, but also the output of a Op, there will be problems. // So we have to rename the variable in the subgraph to make sure // it is either an OP's input or an OP's output. - - auto &subgraph_nodes = *Agent(node).subgraph(); - for (size_t index = 0; index < block_desc.OpSize(); ++index) { - framework::proto::OpDesc *op = block_desc.Op(index)->Proto(); - auto correspond_node = subgraph_nodes[index]; - PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type()); - - std::unordered_map var2id; - for (auto *in_var : correspond_node->inputs) { - var2id[in_var->Name()] = in_var->id(); - } - // rename for the input variables of op inside subgraph - for (int i = 0; i < op->inputs_size(); i++) { - // one input - auto *in_var = op->mutable_inputs(i); - std::vector replaced_names; - for (int k = 0; k < in_var->arguments_size(); k++) { // all the arguments - std::string arg_value = in_var->arguments(k); - std::string arg_value_with_id = - arg_value + std::to_string(var2id[arg_value]); - if (input_names_with_id.count(arg_value_with_id)) { - replaced_names.push_back(arg_value); - } else { - replaced_names.push_back(arg_value_with_id); - } - } - in_var->clear_arguments(); - for (size_t k = 0; k < replaced_names.size(); k++) { - in_var->add_arguments(replaced_names[k]); - } - } - var2id.clear(); - for (auto out_var : correspond_node->outputs) { - var2id[out_var->Name()] = out_var->id(); - } - - // rename for the output variables of op inside subgraph - for (int i = 0; i < op->outputs_size(); i++) { - framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i); - std::vector replaced_names; - for (int k = 0; k < out_var->arguments_size(); k++) { - std::string arg_value = out_var->arguments(k); - std::string arg_value_with_id = - arg_value + std::to_string(var2id[arg_value]); - if (output_names_with_id.count(arg_value_with_id)) { - output_name_map[arg_value] = arg_value_with_id; - } - replaced_names.push_back(arg_value_with_id); - } - out_var->clear_arguments(); - for (size_t k = 0; k < replaced_names.size(); k++) { - out_var->add_arguments(replaced_names[k]); - } - } - } + RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id, + &output_names_with_id, &output_names, &output_name_map); // When tensorrt engine runs at the end of the operation, // output_mapping help us copy the data from the renamed ITensor @@ -212,6 +170,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, PADDLE_ENFORCE(output_name_map.count(name) != 0); output_mapping.push_back(output_name_map[name]); } + PADDLE_ENFORCE(!output_mapping.empty()); 
auto *vars = block_desc.Proto()->mutable_vars(); for (framework::ir::Node *node : graph->Nodes()) { @@ -222,54 +181,104 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(), "the block has no var-desc"); - PADDLE_ENFORCE(!output_mapping.empty()); + + // Set attrs + op_desc->SetType("tensorrt_engine"); + op_desc->SetInput( + "Xs", std::vector(input_names.begin(), input_names.end())); + + op_desc->SetOutput( + "Ys", std::vector(output_names.begin(), output_names.end())); + + op_desc->SetBlockAttr("sub_block", new_block); SetAttr(op_desc->Proto(), "subgraph", block_desc.Proto()->SerializeAsString()); - // Set attrs SetAttr(op_desc->Proto(), "max_batch_size", Get("max_batch_size")); SetAttr(op_desc->Proto(), "workspace_size", Get("workspace_size")); - SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes())); SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping); + SetAttr(op_desc->Proto(), "parameters", params); auto enable_int8 = Get("enable_int8"); - auto engine_key = - GenerateEngineKey(input_names_with_id, output_names_with_id); - - std::string calibration_data = GetTrtCalibTableData( - Get("model_opt_cache_dir"), engine_key, enable_int8); + auto use_static_engine = Get("use_static_engine"); + auto engine_key = GenerateEngineKey(input_names_with_id, output_names_with_id, + std::to_string(0)); + + // Get "" when there is no cached calibration table data. + bool load_from_memory = Get("model_from_memory"); + std::string calibration_data = ""; + if (enable_int8) { + calibration_data = GetTrtCalibTableData( + Get("model_opt_cache_dir"), engine_key, enable_int8); + } SetAttr(op_desc->Proto(), "calibration_data", calibration_data); SetAttr(op_desc->Proto(), "enable_int8", enable_int8); SetAttr(op_desc->Proto(), "engine_key", engine_key); -} + std::string trt_engine_serialized_data = ""; -std::vector ExtractParameters( - const std::unordered_set &nodes) { - // We can judge whether a variable is a parameter by - // its presistable property, but sometimes the presistable - // of the feed op output is true, so we have to identify it. - std::vector feed_outputs; - for (const auto &node : nodes) { - if (!node->IsOp()) continue; - std::string op_type = node->Op()->Type(); - if (op_type == "feed") { - std::vector output_names = node->Op()->OutputArgumentNames(); - std::copy(output_names.begin(), output_names.end(), - std::back_inserter(feed_outputs)); - } + SetAttr(op_desc->Proto(), "engine_serialized_data", + trt_engine_serialized_data); + + std::unique_ptr calibrator; + if (enable_int8 && calibration_data.size() != 0) { + calibrator.reset(new tensorrt::TRTInt8Calibrator(calibration_data)); + } + // When in int8 mode and calibration_mode, the program just produces the + // calibration table data. + bool calibration_mode = (enable_int8 && calibration_data.size() == 0); + if (calibration_mode) { + // Calibration mode means the process of generating the int8 calibration table data.
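+ // No engine is built here in that case; the op runs once to collect the calibration table, so we return after the attrs are set.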
+ return; } - std::vector parameters; - for (const auto &node : nodes) { - if (!node->IsVar()) continue; - if (node->Var()->Persistable() && - std::find(feed_outputs.begin(), feed_outputs.end(), node->Name()) == - feed_outputs.end()) { - parameters.push_back(node->Name()); + std::copy(params.begin(), params.end(), + std::back_inserter(*repetitive_params)); + bool need_serialize = (use_static_engine && !load_from_memory); + + if (need_serialize) { + trt_engine_serialized_data = GetTrtEngineSerializedData( + Get("model_opt_cache_dir"), engine_key); + // We can load the engine info that was serialized to disk earlier. + if (!trt_engine_serialized_data.empty()) { + SetAttr(op_desc->Proto(), "engine_serialized_data", + trt_engine_serialized_data); + LOG(INFO) << "Load TRT Optimized Info from " + << GetTrtEngineSerializedPath( + Get("model_opt_cache_dir"), engine_key); + return; } } - return parameters; + + // The following code will NOT run in these situations: + // 1. calibration mode (generating the trt int8 calibration table data) + // 2. the serialized trt engine info has already been loaded. + LOG(INFO) << "Prepare TRT engine (optimize model structure, select OP " + "kernel, etc). This process may cost a lot of time."; + std::unique_ptr trt_engine( + new tensorrt::TensorRTEngine( + Get("max_batch_size"), Get("workspace_size"), enable_int8, + calibrator.get(), Get("gpu_device_id"))); + auto *scope = param_scope(); + framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto()); + std::unordered_set param_set(params.begin(), params.end()); + inference::Singleton::Global() + .ConvertBlockToTRTEngine( + &block_desc_temp, *scope, + std::vector(input_names.begin(), input_names.end()), + param_set, output_mapping, trt_engine.get()); + nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize(); + trt_engine_serialized_data = + std::string((const char *)serialized_engine_data->data(), + serialized_engine_data->size()); + + if (need_serialize) { + SaveTrtEngineSerializedDataToFile( + GetTrtEngineSerializedPath(Get("model_opt_cache_dir"), + engine_key), + trt_engine_serialized_data); + } + SetAttr(op_desc->Proto(), "engine_serialized_data", + trt_engine_serialized_data); } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h index 502353b95fc15e763900a0caf1649257508f0880..f043670c5af39c1bdf8d4f00c7294fb53a4c9039 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h @@ -13,8 +13,14 @@ // limitations under the License.
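Condensed, the static-engine logic added above is one decision plus a cache lookup. The sketch below mirrors it with stand-in helpers; `BuildEngine` is a stub for the expensive TensorRT build-and-serialize step, and the cache directory is illustrative:

```cpp
// Condensed sketch of the static-engine caching decision above.
#include <fstream>
#include <sstream>
#include <string>

std::string CachePath(const std::string &dir, const std::string &key) {
  return dir + "/trt_serialized_" + key;  // same layout as helper.h
}
std::string LoadCache(const std::string &dir, const std::string &key) {
  std::ifstream in(CachePath(dir, key));
  if (!in) return "";  // "" means no cached engine yet
  std::stringstream ss;
  ss << in.rdbuf();
  return ss.str();
}
void SaveCache(const std::string &dir, const std::string &key,
               const std::string &blob) {
  std::ofstream(CachePath(dir, key)) << blob;
}
std::string BuildEngine() { return "<serialized engine bytes>"; }  // stub

std::string PrepareEngineBlob(bool use_static_engine, bool load_from_memory,
                              const std::string &engine_key) {
  const std::string dir = "./model_opt_cache";  // illustrative
  const bool need_serialize = use_static_engine && !load_from_memory;
  if (need_serialize) {
    std::string cached = LoadCache(dir, engine_key);
    if (!cached.empty()) return cached;  // reuse the optimized engine
  }
  std::string blob = BuildEngine();  // slow path: build and serialize
  if (need_serialize) SaveCache(dir, engine_key, blob);
  return blob;
}
```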
#pragma once -#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h" namespace paddle { namespace inference { @@ -26,8 +32,9 @@ class TensorRtSubgraphPass : public framework::ir::FusePassBase { std::unique_ptr graph) const override; private: - void CreateTensorRTOp(framework::ir::Node *x, - framework::ir::Graph *graph) const; + void CreateTensorRTOp(framework::ir::Node *x, framework::ir::Graph *graph, + const std::vector &graph_params, + std::vector *repetitive_params) const; void CleanIntermediateOutputs(framework::ir::Node *node); }; diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc index 8be2d3ac0b105e50fe619a720929dedaacb75537..d13ec7608c3e8075c1ef62fd4d47fbeee06e9005 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc @@ -31,6 +31,13 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { // The parameters are on the cpu, therefore, synchronization is not necessary. if (!argument->use_gpu()) return; + auto &graph = argument->main_graph(); + std::vector repetitive_params; + + if (graph.Has(framework::ir::kRepetitiveParamAttr)) + repetitive_params = graph.Get>( + framework::ir::kRepetitiveParamAttr); + LOG(INFO) << "Sync params from CPU to GPU"; PADDLE_ENFORCE(argument->gpu_device_id_valid()); @@ -43,6 +50,10 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { // Because there exists the case that new parameter variables are not added to // the program in the analysis pass. 
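+ // Parameters already absorbed into a sub-graph engine are listed in repetitive_params; the added check below skips them instead of copying them to the GPU.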
for (auto &var_name : all_vars) { + if (std::count(repetitive_params.begin(), repetitive_params.end(), + var_name)) { + continue; + } auto *var = scope->FindLocalVar(var_name); PADDLE_ENFORCE(var != nullptr); if (var->IsType() || diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h index a95f460df6f9636fc17a5cf76920f5f459385120..61990150a30db147418c4301359428cf3c6db541 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h @@ -17,6 +17,7 @@ #include #include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/analysis/analysis_pass.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 85755fc471ae3d37ec5d005882668ccf0c35b354..882bb3468388e794e975d87de73537ac41f17cf7 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -27,15 +27,25 @@ if(WITH_GPU AND TENSORRT_FOUND) set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter) endif() +if (ANAKIN_FOUND) + set(inference_deps ${inference_deps} anakin_op_converter anakin_engine) +endif() + add_subdirectory(details) -cc_library(analysis_config SRCS analysis_config.cc DEPS lod_tensor paddle_pass_builder) +if(WITH_MKLDNN) + set(mkldnn_quantizer_src mkldnn_quantizer.cc) + set(mkldnn_quantizer_cfg mkldnn_quantizer_config) + cc_library(${mkldnn_quantizer_cfg} SRCS mkldnn_quantizer_config.cc DEPS lod_tensor paddle_pass_builder) +endif() + +cc_library(analysis_config SRCS analysis_config.cc DEPS ${mkldnn_quantizer_cfg} lod_tensor paddle_pass_builder) cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc) -cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api zero_copy_tensor +cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS paddle_inference_api zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager ${inference_deps}) cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config - analysis_config paddle_pass_builder zero_copy_tensor + paddle_pass_builder zero_copy_tensor reset_tensor_array) cc_test(test_paddle_inference_api diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 522ab495227e9b8c52b8d38db696fa9b785ba642..aee94e12340597e981ac385a01335d2ffa069191 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -21,6 +21,7 @@ #include "paddle/fluid/platform/gpu_info.h" namespace paddle { +extern const std::vector kAnakinSubgraphPasses; PassStrategy *AnalysisConfig::pass_builder() const { if (!pass_builder_.get()) { @@ -103,9 +104,17 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(tensorrt_max_batchsize_); CP_MEMBER(tensorrt_min_subgraph_size_); CP_MEMBER(tensorrt_precision_mode_); + CP_MEMBER(trt_use_static_engine_); // MKLDNN related. CP_MEMBER(use_mkldnn_); CP_MEMBER(mkldnn_enabled_op_types_); + // Quantization related. 
+ CP_MEMBER(use_mkldnn_quantizer_); + CP_MEMBER(mkldnn_quantizer_config_); + + CP_MEMBER(use_anakin_); + CP_MEMBER(anakin_max_batchsize_); + CP_MEMBER(anakin_max_input_shape_); // Ir related. CP_MEMBER(enable_ir_optim_); @@ -142,9 +151,29 @@ void AnalysisConfig::EnableMKLDNN() { Update(); } +void AnalysisConfig::EnableMkldnnQuantizer() { +#ifdef PADDLE_WITH_MKLDNN + if (!mkldnn_quantizer_config_) + mkldnn_quantizer_config_.reset(new MkldnnQuantizerConfig()); + use_mkldnn_quantizer_ = true; +#else + LOG(ERROR) << "Please compile with MKLDNN first to use MkldnnQuantizer"; + use_mkldnn_quantizer_ = false; +#endif + + Update(); +} + +std::shared_ptr AnalysisConfig::mkldnn_quantizer_config() + const { + PADDLE_ENFORCE_NOT_NULL(mkldnn_quantizer_config_, + "MkldnnQuantizer was not enabled yet."); + return mkldnn_quantizer_config_; +} + void AnalysisConfig::EnableTensorRtEngine( int workspace_size, int max_batch_size, int min_subgraph_size, - AnalysisConfig::Precision precision_mode) { + AnalysisConfig::Precision precision_mode, bool use_static) { #ifdef PADDLE_WITH_CUDA if (!use_gpu()) { LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first"; @@ -156,6 +185,7 @@ void AnalysisConfig::EnableTensorRtEngine( tensorrt_max_batchsize_ = max_batch_size; tensorrt_min_subgraph_size_ = min_subgraph_size; tensorrt_precision_mode_ = precision_mode; + trt_use_static_engine_ = use_static; Update(); #else @@ -200,6 +230,7 @@ void AnalysisConfig::Update() { // Append after the Affine_channel_conv_fuse pass. pass_builder()->InsertPass(3, "tensorrt_subgraph_pass"); } + pass_builder()->DeletePass("runtime_context_cache_pass"); } if (use_mkldnn_) { @@ -216,10 +247,43 @@ void AnalysisConfig::Update() { #endif } + // Quantization passes must come after all other optimization passes + if (use_mkldnn_quantizer_) { + if (!enable_ir_optim_) { + LOG(ERROR) << "EnableMkldnnQuantizer() only works when IR optimization " + "is enabled."; + } +#ifdef PADDLE_WITH_MKLDNN + pass_builder()->EnableMkldnnQuantizer(); +#else + LOG(ERROR) << "Please compile with MKLDNN first to use MkldnnQuantizer"; + use_mkldnn_quantizer_ = false; +#endif + } + +#ifdef PADDLE_WITH_MKLDNN + // Do not optimize before quantization + if (enable_memory_optim_ && !use_mkldnn_quantizer_) { +#else if (enable_memory_optim_) { +#endif pass_builder()->AppendAnalysisPass("memory_optimize_pass"); } + if (use_anakin_) { + PADDLE_ENFORCE(!use_tensorrt_, + "Anakin sub-graph and TensorRT sub-graph are not allowed to " + "run at the same time!"); + PADDLE_ENFORCE( + use_gpu_, + "Anakin sub-graph engine needs GPU, please use the EnableGpu API."); + + pass_builder()->ClearPasses(); + for (const auto &pass : kAnakinSubgraphPasses) { + pass_builder()->AppendPass(pass); + } + } + if (ir_debug_) { pass_builder()->TurnOnDebug(); } @@ -248,6 +312,7 @@ std::string AnalysisConfig::SerializeInfoCache() { for (auto &item : mkldnn_enabled_op_types_) ss << item; ss << ";"; + ss << use_mkldnn_quantizer_; ss << model_from_memory_; ss << enable_ir_optim_; @@ -256,7 +321,7 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << specify_input_name_; ss << cpu_math_library_num_threads_; - + ss << use_anakin_; return ss.str(); } @@ -306,6 +371,11 @@ void AnalysisConfig::SetModelBuffer(const char *prog_buffer, Update(); } +void AnalysisConfig::SetEngineOptInfo( + std::map engine_opt_info) { + engine_opt_info_ = engine_opt_info; +} + NativeConfig AnalysisConfig::ToNativeConfig() const { NativeConfig config; config.model_dir = model_dir_; @@ -322,5 +392,12 @@ void
AnalysisConfig::SwitchIrDebug(int x) { ir_debug_ = x; Update(); } - +void AnalysisConfig::EnableAnakinEngine( + int max_batch_size, + std::map> max_input_shape) { + anakin_max_batchsize_ = max_batch_size; + anakin_max_input_shape_ = max_input_shape; + use_anakin_ = true; + Update(); +} } // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 467d4411376381df950bb582f9c73410284a5e2d..f7260561547bb0bd7aea1590239e38090953f6fc 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -18,6 +18,7 @@ #include #include #include +#include #include #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/feed_fetch_type.h" @@ -35,12 +36,20 @@ #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/inference/api/mkldnn_quantizer.h" +#endif + #if PADDLE_WITH_TENSORRT #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" +#endif +#if PADDLE_WITH_ANAKIN +#include "paddle/fluid/inference/anakin/convert/op_converter.h" #endif DECLARE_bool(profile); @@ -243,6 +252,8 @@ bool AnalysisPredictor::SetFeed(const std::vector &inputs, input_ptr = input.mutable_data(ddim, place_); } else if (inputs[i].dtype == PaddleDType::FLOAT32) { input_ptr = input.mutable_data(ddim, place_); + } else if (inputs[i].dtype == PaddleDType::INT32) { + input_ptr = input.mutable_data(ddim, place_); } else { LOG(ERROR) << "unsupported feed type " << inputs[i].dtype; return false; @@ -326,17 +337,17 @@ bool AnalysisPredictor::GetFetch(std::vector *outputs, } else if (type == framework::proto::VarType::INT64) { GetFetchOne(fetch, output); output->dtype = PaddleDType::INT64; + } else if (type == framework::proto::VarType::INT32) { + GetFetchOne(fetch, output); + output->dtype = PaddleDType::INT32; } else { - LOG(ERROR) << "unknown type, only support float32 and int64 now."; + LOG(ERROR) << "unknown type, only support float32, int64 and int32 now."; } } return true; } -// NOTE All the members in AnalysisConfig should be copied to Argument. 
-void AnalysisPredictor::OptimizeInferenceProgram() { - status_program_optimized_ = true; - +void AnalysisPredictor::PrepareArgument() { argument_.SetUseGPU(config_.use_gpu()); argument_.SetGPUDeviceId(config_.gpu_device_id()); argument_.SetEnableMemoryOptim(config_.enable_memory_optim()); @@ -344,7 +355,10 @@ void AnalysisPredictor::OptimizeInferenceProgram() { argument_.SetStaticMemoryOptimForceUpdate( config_.static_memory_optim_force_update_); argument_.SetModelFromMemory(config_.model_from_memory_); + argument_.SetEngineOptInfo(config_.engine_opt_info_); // Analyze inference_program + argument_.SetUseAnakin(config_.anakin_engine_enabled()); + argument_.SetPredictorID(predictor_id_); if (!config_.model_dir().empty()) { argument_.SetModelDir(config_.model_dir()); } else { @@ -365,6 +379,13 @@ void AnalysisPredictor::OptimizeInferenceProgram() { argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_); argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_); argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_); + argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_); + } + + if (config_.use_gpu() && config_.anakin_engine_enabled()) { + argument_.SetAnakinMaxBatchSize(config_.anakin_max_batchsize_); + argument_.SetAnakinMaxInputShape(config_.anakin_max_input_shape_); + LOG(INFO) << "Anakin subgraph engine is enabled"; } if (config_.use_mkldnn_) { @@ -372,6 +393,16 @@ void AnalysisPredictor::OptimizeInferenceProgram() { argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_); } +#ifdef PADDLE_WITH_MKLDNN + if (config_.mkldnn_quantizer_enabled()) { + LOG(INFO) << "Quantization is enabled"; + argument_.SetQuantizeEnabledOpTypes( + config_.mkldnn_quantizer_config()->enabled_op_types()); + argument_.SetQuantizeExcludedOpIds( + config_.mkldnn_quantizer_config()->excluded_op_ids()); + } +#endif + auto passes = config_.pass_builder()->AllPasses(); if (!config_.ir_optim()) { passes.clear(); @@ -380,6 +411,13 @@ void AnalysisPredictor::OptimizeInferenceProgram() { argument_.SetIrAnalysisPasses(passes); argument_.SetAnalysisPasses(config_.pass_builder()->AnalysisPasses()); argument_.SetScopeNotOwned(scope_.get()); +} + +// NOTE All the members in AnalysisConfig should be copied to Argument. +void AnalysisPredictor::OptimizeInferenceProgram() { + status_program_optimized_ = true; + + PrepareArgument(); Analyzer().Run(&argument_); PADDLE_ENFORCE(argument_.scope_valid()); @@ -396,7 +434,7 @@ std::unique_ptr CreatePaddlePredictor< VLOG(3) << "create AnalysisConfig"; if (config.use_gpu()) { // 1. 
GPU memory - PADDLE_ENFORCE_GT(config.memory_pool_init_size_mb(), 0.f); + PADDLE_ENFORCE_GE(config.memory_pool_init_size_mb(), 0.f); PADDLE_ENFORCE_GE(config.gpu_device_id(), 0, "Invalid device id %d", config.gpu_device_id()); std::vector flags; @@ -421,12 +459,31 @@ std::unique_ptr CreatePaddlePredictor< } std::unique_ptr predictor(new AnalysisPredictor(config)); - if (!dynamic_cast(predictor.get())->Init(nullptr)) { + auto predictor_p = dynamic_cast(predictor.get()); + + if (!predictor_p->Init(nullptr)) { + return nullptr; + } + + if (config.mkldnn_quantizer_enabled() && !predictor_p->MkldnnQuantize()) { return nullptr; } + return predictor; } +bool AnalysisPredictor::MkldnnQuantize() { +#if PADDLE_WITH_MKLDNN + if (!mkldnn_quantizer_) + mkldnn_quantizer_ = new AnalysisPredictor::MkldnnQuantizer( + *this, config_.mkldnn_quantizer_config()); + return mkldnn_quantizer_->Quantize(); +#else + LOG(ERROR) << "Please compile with MKLDNN first to use MkldnnQuantizer"; + return false; +#endif +} + void AnalysisPredictor::PrepareFeedFetch() { PADDLE_ENFORCE_NOT_NULL(sub_scope_); CreateFeedFetchVar(sub_scope_); @@ -438,12 +495,14 @@ void AnalysisPredictor::PrepareFeedFetch() { } feeds_[idx] = op; feed_names_[op->Output("Out")[0]] = idx; + idx2feeds_[idx] = op->Output("Out")[0]; } else if (op->Type() == "fetch") { int idx = boost::get(op->GetAttr("col")); if (fetches_.size() <= static_cast(idx)) { fetches_.resize(idx + 1); } fetches_[idx] = op; + idx2fetches_[idx] = op->Input("X")[0]; } } } @@ -456,6 +515,22 @@ void AnalysisPredictor::CreateFeedFetchVar(framework::Scope *scope) { var->GetMutable(); } +std::vector AnalysisPredictor::GetInputNames() { + std::vector input_names; + for (auto &item : idx2feeds_) { + input_names.push_back(item.second); + } + return input_names; +} + +std::vector AnalysisPredictor::GetOutputNames() { + std::vector output_names; + for (auto &item : idx2fetches_) { + output_names.push_back(item.second); + } + return output_names; +} + std::unique_ptr AnalysisPredictor::GetInputTensor( const std::string &name) { PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name); @@ -463,6 +538,13 @@ std::unique_ptr AnalysisPredictor::GetInputTensor( new ZeroCopyTensor(static_cast(executor_->scope()))); res->input_or_output_ = true; res->SetName(name); + if (platform::is_cpu_place(place_)) { + res->SetPlace(PaddlePlace::kCPU); + } else { + auto gpu_place = boost::get(place_); + res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); + } + return res; } @@ -473,6 +555,12 @@ std::unique_ptr AnalysisPredictor::GetOutputTensor( new ZeroCopyTensor(static_cast(executor_->scope()))); res->input_or_output_ = false; res->SetName(name); + if (platform::is_cpu_place(place_)) { + res->SetPlace(PaddlePlace::kCPU); + } else { + auto gpu_place = boost::get(place_); + res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); + } return res; } @@ -654,6 +742,13 @@ AnalysisPredictor::~AnalysisPredictor() { scope_->DeleteScope(sub_scope_); } +#if PADDLE_WITH_MKLDNN + if (mkldnn_quantizer_) { + delete mkldnn_quantizer_; + mkldnn_quantizer_ = nullptr; + } +#endif + // TODO(Superjomn) deduce the directory path. 
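The revised CreatePaddlePredictor flow above now rejects the predictor when MkldnnQuantize() fails, so quantization is driven entirely from the config. A sketch of the end-to-end setup, assuming the warm-up batch is already assembled and with an invented model path:

    #include <memory>
    #include <vector>
    #include "paddle/fluid/inference/api/paddle_inference_api.h"

    std::unique_ptr<paddle::PaddlePredictor> BuildInt8Predictor(
        std::shared_ptr<std::vector<paddle::PaddleTensor>> warmup_data) {
      paddle::AnalysisConfig cfg("./fp32_model");  // hypothetical path
      cfg.EnableMKLDNN();
      cfg.EnableMkldnnQuantizer();
      cfg.mkldnn_quantizer_config()->SetWarmupData(warmup_data);
      cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(1);
      // CreatePaddlePredictor now runs MkldnnQuantize() after Init().
      return paddle::CreatePaddlePredictor(cfg);
    }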
std::string out_path = inference::analysis::GetMemoryCachePath( config_.model_dir(), config_.prog_file()); @@ -768,3 +863,27 @@ USE_TRT_CONVERTER(prelu); USE_TRT_CONVERTER(conv2d_transpose); USE_TRT_CONVERTER(leaky_relu); #endif + +#if PADDLE_WITH_ANAKIN +USE_ANAKIN_CONVERTER(mul); +USE_ANAKIN_CONVERTER(fc); +USE_ANAKIN_CONVERTER(conv2d); +USE_ANAKIN_CONVERTER(conv2d_fusion); +USE_ANAKIN_CONVERTER(concat); +USE_ANAKIN_CONVERTER(split); +USE_ANAKIN_CONVERTER(relu); +USE_ANAKIN_CONVERTER(sigmoid); +USE_ANAKIN_CONVERTER(tanh); +USE_ANAKIN_CONVERTER(pool2d); +USE_ANAKIN_CONVERTER(elementwise_add); +USE_ANAKIN_CONVERTER(elementwise_mul); +USE_ANAKIN_CONVERTER(batch_norm); +USE_ANAKIN_CONVERTER(flatten); +USE_ANAKIN_CONVERTER(reshape); +USE_ANAKIN_CONVERTER(transpose); +USE_ANAKIN_CONVERTER(softmax); +USE_ANAKIN_CONVERTER(detection_out); +USE_ANAKIN_CONVERTER(density_prior_box); +USE_ANAKIN_CONVERTER(dropout); +USE_ANAKIN_CONVERTER(sum); +#endif diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index d5445c58e45ae64a8cfab03cb610e3677729338b..e4c537f426650f16ced32d3cb61b944a78c35b43 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -15,12 +15,14 @@ #pragma once #include #include +#include #include #include #include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/details/reset_tensor_array.h" +#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/string/printf.h" #ifdef PADDLE_WITH_TESTING @@ -43,7 +45,9 @@ using framework::NaiveExecutor; */ class AnalysisPredictor : public PaddlePredictor { public: - explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {} + explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) { + predictor_id_ = inference::GetUniqueId(); + } ~AnalysisPredictor(); bool Init(const std::shared_ptr &parent_scope, @@ -53,6 +57,9 @@ class AnalysisPredictor : public PaddlePredictor { std::vector *output_data, int batch_size = -1) override; + std::vector GetInputNames(); + std::vector GetOutputNames(); + std::unique_ptr GetInputTensor( const std::string &name) override; std::unique_ptr GetOutputTensor( @@ -63,6 +70,7 @@ class AnalysisPredictor : public PaddlePredictor { void CreateFeedFetchVar(framework::Scope *scope); void PrepareFeedFetch(); + void PrepareArgument(); void OptimizeInferenceProgram(); Argument &analysis_argument() { return argument_; } @@ -76,6 +84,8 @@ class AnalysisPredictor : public PaddlePredictor { std::string GetSerializedProgram() const override; + bool MkldnnQuantize(); + protected: // For memory optimization. bool need_collect_var_shapes_for_memory_optim(); @@ -131,7 +141,21 @@ class AnalysisPredictor : public PaddlePredictor { std::shared_ptr inference_program_; std::vector feeds_; std::map feed_names_; + // Sorted according to the idx. + std::map idx2feeds_; std::vector fetches_; + std::map idx2fetches_; + +#if PADDLE_WITH_MKLDNN + // Helper class to perform quantization + class MkldnnQuantizer; + MkldnnQuantizer *mkldnn_quantizer_{nullptr}; + +#if PADDLE_WITH_TESTING + friend class MkldnnQuantizerTest; +#endif +#endif + // Memory buffer for feed inputs. The temporary LoDTensor will cause serious // concurrency problems, wrong results and memory leak, so cache them. 
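The GetInputNames()/GetOutputNames() pair declared here is backed by the idx2feeds_/idx2fetches_ maps, which are std::map keyed by the feed/fetch index, so names come back in index order. A small sketch of name-driven I/O lookup, assuming an already-built predictor:

    #include <string>
    #include <vector>
    #include "paddle/fluid/inference/api/paddle_inference_api.h"

    // Enumerate inputs/outputs by name instead of hard-coding variable names.
    void ListIo(paddle::PaddlePredictor* predictor) {
      // Both lists arrive in feed/fetch index order (std::map iteration).
      for (const std::string& name : predictor->GetInputNames()) {
        auto t = predictor->GetInputTensor(name);  // place/device now pre-set
        (void)t;
      }
      for (const std::string& name : predictor->GetOutputNames()) {
        auto t = predictor->GetOutputTensor(name);
        (void)t;
      }
    }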
std::vector feed_tensors_; @@ -143,6 +167,7 @@ class AnalysisPredictor : public PaddlePredictor { const size_t max_shape_collect_count_{1000}; int need_collect_var_shapes_{-1}; // -1 for default, 0 for false, 1 for true. std::vector>> batch_var_shapes_; + int predictor_id_; private: // Some status here that help to determine the status inside the predictor. diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index 6696839b53fb21c274843afd86b5d8b5c2042c51..0429a287c74f9db5257181151d90b77da86c694c 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -17,9 +17,13 @@ #include #include // NOLINT #include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/inference/api/mkldnn_quantizer.h" +#endif DEFINE_string(dirname, "", "dirname to tests."); @@ -243,4 +247,241 @@ TEST(AnalysisPredictor, memory_optim) { inference::CompareResult(output, output1); } +#ifdef PADDLE_WITH_MKLDNN +class MkldnnQuantizerTest : public testing::Test { + public: + MkldnnQuantizerTest() { + AnalysisConfig config(FLAGS_dirname); + + predictor.reset(new AnalysisPredictor(config)); + auto* predictor_p = static_cast(predictor.get()); + + auto qconfig = std::make_shared(); + + mkldnn_quantizer.reset( + new AnalysisPredictor::MkldnnQuantizer(*predictor_p, qconfig)); + } + + std::pair, float> Histogram( + const framework::LoDTensor& var_tensor, float min_val, float max_val, + int num_bins) const { + return mkldnn_quantizer->Histogram(var_tensor, min_val, max_val, num_bins); + } + + std::pair GetMaxScalingFactor( + const framework::LoDTensor& var_tensor, bool is_unsigned) const { + return mkldnn_quantizer->GetMaxScalingFactor(var_tensor, is_unsigned); + } + + std::pair GetMaxChScalingFactor( + const framework::LoDTensor& var_tensor, bool is_unsigned) const { + return mkldnn_quantizer->GetMaxChScalingFactor(var_tensor, is_unsigned); + } + + std::pair GetKLScalingFactor( + const framework::LoDTensor& var_tensor, bool is_unsigned) const { + return mkldnn_quantizer->GetKLScalingFactor(var_tensor, is_unsigned); + } + + protected: + std::unique_ptr predictor; + std::unique_ptr mkldnn_quantizer; + float abs_error = 1e-6; + static const std::array non_negative_values; + static const std::array positive_and_negative_values; +}; + +const std::array MkldnnQuantizerTest::non_negative_values = { + 0.0158671, 0.026459, 0.0280772, 0.00962479, 0.0131628, + 0.016704, 0.00118407, 0.00765726, 0.0123213, 0.00944741}; +const std::array MkldnnQuantizerTest::positive_and_negative_values = + {-0.0482659, -0.0102493, -0.00794221, -0.00387115, -0.00674586, + -0.0495346, 0.0629528, -0.00531285, -0.0230353, 0.0269089}; + +TEST_F(MkldnnQuantizerTest, histogram_inverted_min_max) { + const auto& values = non_negative_values; + auto min_val = *std::min_element(values.begin(), values.end()); + auto max_val = *std::max_element(values.begin(), values.end()); + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data(platform::CPUPlace())); + + ASSERT_THROW(Histogram(var_tensor, max_val, min_val, 3), + platform::EnforceNotMet); +} + +TEST_F(MkldnnQuantizerTest, 
histogram_non_negative_to_3) { + // all non-negative values + const auto& values = non_negative_values; + auto min_val = *std::min_element(values.begin(), values.end()); + auto max_val = *std::max_element(values.begin(), values.end()); + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data(platform::CPUPlace())); + + std::vector histogram; + float bin_width; + + std::tie(histogram, bin_width) = Histogram(var_tensor, min_val, max_val, 3); + + ASSERT_NEAR(bin_width, std::abs(max_val - min_val) / 3.f, abs_error) + << "Improperly calculated bin_width."; + + ASSERT_EQ(histogram[0], 4); + ASSERT_EQ(histogram[1], 4); + ASSERT_EQ(histogram[2], 2); +} + +TEST_F(MkldnnQuantizerTest, histogram_positive_and_negative_to_3) { + const auto& values = positive_and_negative_values; + auto min_val = *std::min_element(values.begin(), values.end()); + auto max_val = *std::max_element(values.begin(), values.end()); + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data(platform::CPUPlace())); + + std::vector histogram; + float bin_width; + + std::tie(histogram, bin_width) = Histogram(var_tensor, min_val, max_val, 3); + + ASSERT_NEAR(bin_width, std::abs(max_val - min_val) / 3.0f, abs_error) + << "Improperly calculated bin_width."; + + ASSERT_EQ(histogram[0], 3); + ASSERT_EQ(histogram[1], 5); + ASSERT_EQ(histogram[2], 2); +} + +TEST_F(MkldnnQuantizerTest, histogram_zero_bins) { + const auto& values = non_negative_values; + auto min_val = *std::min_element(values.begin(), values.end()); + auto max_val = *std::max_element(values.begin(), values.end()); + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data(platform::CPUPlace())); + + ASSERT_THROW(Histogram(var_tensor, min_val, max_val, 0), + platform::EnforceNotMet); +} + +TEST_F(MkldnnQuantizerTest, histogram_empty) { + // empty tensor + ASSERT_THROW(Histogram({}, -1, 1, 1), platform::EnforceNotMet); + + // zero tensor + framework::LoDTensor var_tensor; + var_tensor.Resize({0}); + ASSERT_TRUE(var_tensor.mutable_data(platform::CPUPlace())); + + ASSERT_THROW(Histogram(var_tensor, -1, 1, 1), platform::EnforceNotMet); +} + +TEST_F(MkldnnQuantizerTest, kl_scaling_factor_signed) { + const auto& values = positive_and_negative_values; + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data(platform::CPUPlace())); + + bool is_unsigned; + framework::LoDTensor lod_tensor; + + std::tie(is_unsigned, lod_tensor) = GetKLScalingFactor(var_tensor, false); + + ASSERT_EQ(is_unsigned, false); + ASSERT_EQ(lod_tensor.numel(), 1); + ASSERT_NEAR(lod_tensor.data()[0], 1.0 / 0.0899106152344, abs_error); +} + +TEST_F(MkldnnQuantizerTest, max_scaling_factor_signed) { + const auto& values = positive_and_negative_values; + auto max_val = *std::max_element(values.begin(), values.end()); + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data(platform::CPUPlace())); + + bool is_unsigned; + framework::LoDTensor lod_tensor; + + std::tie(is_unsigned, lod_tensor) = GetMaxScalingFactor(var_tensor, false); + + ASSERT_EQ(is_unsigned, false); + ASSERT_EQ(lod_tensor.numel(), 1); + 
ASSERT_NEAR(lod_tensor.data()[0], 1.0 / max_val, abs_error); +} + +TEST_F(MkldnnQuantizerTest, max_scaling_factor_unsigned) { + const auto& values = non_negative_values; + auto max_val = *std::max_element(values.begin(), values.end()); + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data(platform::CPUPlace())); + + bool is_unsigned; + framework::LoDTensor lod_tensor; + + std::tie(is_unsigned, lod_tensor) = GetMaxScalingFactor(var_tensor, true); + + ASSERT_EQ(is_unsigned, true); + ASSERT_EQ(lod_tensor.numel(), 1); + ASSERT_NEAR(lod_tensor.data()[0], 1.0 / max_val, abs_error); +} + +TEST_F(MkldnnQuantizerTest, max_scaling_factor_chwise_unsigned) { + const auto& values = non_negative_values; + auto max_val = *std::max_element(values.begin(), values.end()); + int channels = 3; + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(channels, 1, 1, values.size())); + for (int i = 0; i < channels; i++) + std::copy(begin(values), end(values), + var_tensor.mutable_data(platform::CPUPlace()) + + i * values.size()); + + bool is_unsigned; + framework::LoDTensor lod_tensor; + + std::tie(is_unsigned, lod_tensor) = GetMaxChScalingFactor(var_tensor, true); + + ASSERT_EQ(is_unsigned, true); + ASSERT_EQ(lod_tensor.numel(), channels); + for (int i = 0; i < channels; i++) { + ASSERT_NEAR(lod_tensor.data()[i], 1.0 / max_val, abs_error); + } +} + +TEST_F(MkldnnQuantizerTest, kl_scaling_factor_unsigned) { + const auto& values = non_negative_values; + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data(platform::CPUPlace())); + + bool is_unsigned; + framework::LoDTensor lod_tensor; + + std::tie(is_unsigned, lod_tensor) = GetKLScalingFactor(var_tensor, true); + + ASSERT_EQ(is_unsigned, true); + ASSERT_EQ(lod_tensor.numel(), 1); + ASSERT_NEAR(lod_tensor.data()[0], 1.0 / 0.0252845321362, abs_error); +} +#endif + } // namespace paddle diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc index f83537f064187e67a08c8bbce52707d1c824abeb..7d57b6ec74468dbdb0519f85140629a0ac01c18d 100644 --- a/paddle/fluid/inference/api/api.cc +++ b/paddle/fluid/inference/api/api.cc @@ -28,6 +28,8 @@ int PaddleDtypeSize(PaddleDType dtype) { return sizeof(float); case PaddleDType::INT64: return sizeof(int64_t); + case PaddleDType::INT32: + return sizeof(int32_t); default: assert(false); return -1; diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 048286a843f0190a8139cb86eda4f3a3a40d89a1..54f40563c3662af24e794422be4d3262d86c76a7 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -203,6 +203,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, input_ptr = input.mutable_data(ddim, place_); } else if (inputs[i].dtype == PaddleDType::FLOAT32) { input_ptr = input.mutable_data(ddim, place_); + } else if (inputs[i].dtype == PaddleDType::INT32) { + input_ptr = input.mutable_data(ddim, place_); } else { LOG(ERROR) << "unsupported feed type " << inputs[i].dtype; return false; @@ -281,8 +283,11 @@ bool NativePaddlePredictor::GetFetch(std::vector *outputs, } else if (type == framework::DataTypeTrait::DataType) { GetFetchOne(fetch, output); output->dtype = PaddleDType::INT64; + } else if (type == framework::DataTypeTrait::DataType) { + GetFetchOne(fetch, output); + output->dtype = 
PaddleDType::INT32; } else { - LOG(ERROR) << "unknown type, only support float32 and int64 now."; + LOG(ERROR) << "unknown type, only support float32, int64 and int32 now."; } } return true; diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc index e82cb53bf073d3d1ab9a518218edaf430728463f..2dc5dda34d02c6df9c0ccbc47a1ac960e1aca3f5 100644 --- a/paddle/fluid/inference/api/api_impl_tester.cc +++ b/paddle/fluid/inference/api/api_impl_tester.cc @@ -42,6 +42,9 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) { } else if (t->type() == framework::proto::VarType::FP32) { pt.data.Reset(t->data(), t->numel() * sizeof(float)); pt.dtype = PaddleDType::FLOAT32; + } else if (t->type() == framework::proto::VarType::INT32) { + pt.data.Reset(t->data(), t->numel() * sizeof(int32_t)); + pt.dtype = PaddleDType::INT32; } else { LOG(FATAL) << "unsupported type."; } diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index 963986f245cdafa737d76953f0e5323e4f74e669..bf2e3593c2beadaea2cb08aa3dcc2370c3e06bf4 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -27,7 +27,7 @@ if [ -d "$TENSORRT_INCLUDE_DIR" -a -d "$TENSORRT_LIB_DIR" ]; then fi PREFIX=inference-vis-demos%2F -URL_ROOT=http://paddlemodels.cdn.bcebos.com/${PREFIX} +URL_ROOT=http://paddlemodels.bj.bcebos.com/${PREFIX} # download vis_demo data function download() { diff --git a/paddle/fluid/inference/api/demo_ci/utils.h b/paddle/fluid/inference/api/demo_ci/utils.h index d70c6aea791219a40c3164b51499f9d5e562be71..1505a898c5bba285b377203c1503b8615666b196 100644 --- a/paddle/fluid/inference/api/demo_ci/utils.h +++ b/paddle/fluid/inference/api/demo_ci/utils.h @@ -88,13 +88,20 @@ void CheckOutput(const std::string& referfile, const PaddleTensor& output) { } break; } - case PaddleDType::FLOAT32: + case PaddleDType::FLOAT32: { for (size_t i = 0; i < numel; ++i) { CHECK_LT( fabs(static_cast(output.data.data())[i] - refer.data[i]), 1e-5); } break; + } + case PaddleDType::INT32: { + for (size_t i = 0; i < numel; ++i) { + CHECK_EQ(static_cast(output.data.data())[i], refer.data[i]); + } + break; + } } } @@ -113,11 +120,18 @@ static std::string SummaryTensor(const PaddleTensor& tensor) { } break; } - case PaddleDType::FLOAT32: + case PaddleDType::FLOAT32: { for (int i = 0; i < std::min(num_elems, 10); i++) { ss << static_cast(tensor.data.data())[i] << " "; } break; + } + case PaddleDType::INT32: { + for (int i = 0; i < std::min(num_elems, 10); i++) { + ss << static_cast(tensor.data.data())[i] << " "; + } + break; + } } return ss.str(); } diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index f60ff40c5da3e9e03c2cb3583263394cb82db805..937b6398f8131a6cf4e8b0002e38f4513f0f884f 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -73,12 +74,88 @@ T *ZeroCopyTensor::data(PaddlePlace *place, int *size) const { return res; } +PaddleDType ZeroCopyTensor::type() const { + EAGER_GET_TENSOR; + auto type = tensor->type(); + if (type == framework::proto::VarType::FP32) { + return 
PaddleDType::FLOAT32; + } else if (type == framework::proto::VarType::INT64) { + return PaddleDType::INT64; + } else if (type == framework::proto::VarType::INT32) { + return PaddleDType::INT32; + } else { + LOG(ERROR) << "unknown type, only support float32, int64 and int32 now."; + } + return PaddleDType::FLOAT32; +} + +template <typename T> +void ZeroCopyTensor::copy_from_cpu(const T *data) { + EAGER_GET_TENSOR; + PADDLE_ENFORCE_GE( + tensor->numel(), 0, + "You should call ZeroCopyTensor::Reshape(const std::vector<int> &shape) " + "function before copy data from cpu."); + size_t ele_size = tensor->numel() * sizeof(T); + + if (place_ == PaddlePlace::kCPU) { + auto *t_data = tensor->mutable_data<T>(platform::CPUPlace()); + std::memcpy(static_cast<void *>(t_data), data, ele_size); + } else { +#ifdef PADDLE_WITH_CUDA + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + platform::CUDAPlace gpu_place(device_); + auto *t_data = tensor->mutable_data<T>(gpu_place); + auto *dev_ctx = + static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place)); + + memory::Copy(gpu_place, static_cast<void *>(t_data), platform::CPUPlace(), + data, ele_size, dev_ctx->stream()); +#else + PADDLE_THROW("Not compile with CUDA, should not reach here."); +#endif + } +} + +template <typename T> +void ZeroCopyTensor::copy_to_cpu(T *data) { + EAGER_GET_TENSOR; + auto ele_num = tensor->numel(); + auto *t_data = tensor->data<T>(); + auto t_place = tensor->place(); + + if (platform::is_cpu_place(t_place)) { + std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T)); + } else { +#ifdef PADDLE_WITH_CUDA + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto gpu_place = boost::get<platform::CUDAPlace>(t_place); + auto *dev_ctx = + static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place)); + memory::Copy(platform::CPUPlace(), static_cast<void *>(data), gpu_place, + t_data, ele_num * sizeof(T), dev_ctx->stream()); + cudaDeviceSynchronize(); +#else + PADDLE_THROW("Not compile with CUDA, should not reach here."); +#endif + } +} +template void ZeroCopyTensor::copy_from_cpu<float>(const float *data); +template void ZeroCopyTensor::copy_from_cpu<int64_t>(const int64_t *data); +template void ZeroCopyTensor::copy_from_cpu<int32_t>(const int32_t *data); +template void ZeroCopyTensor::copy_to_cpu<float>(float *data); +template void ZeroCopyTensor::copy_to_cpu<int64_t>(int64_t *data); +template void ZeroCopyTensor::copy_to_cpu<int32_t>(int32_t *data); + template float *ZeroCopyTensor::data<float>(PaddlePlace *place, int *size) const; template int64_t *ZeroCopyTensor::data<int64_t>(PaddlePlace *place, int *size) const; +template int32_t *ZeroCopyTensor::data<int32_t>(PaddlePlace *place, + int *size) const; template float *ZeroCopyTensor::mutable_data<float>(PaddlePlace place); template int64_t *ZeroCopyTensor::mutable_data<int64_t>(PaddlePlace place); +template int32_t *ZeroCopyTensor::mutable_data<int32_t>(PaddlePlace place); void *ZeroCopyTensor::FindTensor() const { PADDLE_ENFORCE(!name_.empty(), @@ -92,10 +169,10 @@ void *ZeroCopyTensor::FindTensor() const { return tensor; } -std::vector<int64_t> ZeroCopyTensor::shape() const { +std::vector<int> ZeroCopyTensor::shape() const { EAGER_GET_TENSOR; PADDLE_ENFORCE(tensor_, "not found tensor called %s in the scope", name_); - return framework::vectorize(tensor->dims()); + return framework::vectorize2int(tensor->dims()); } void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) { diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc index 12071e09f8442f2c52a06b7c3fe4bed2c28b524a..cbbb3ea2d1395acdf4c460bea4b7868c31a20e53 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc +++ 
b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc @@ -37,7 +37,7 @@ template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place); void *ZeroCopyTensor::FindTensor() const { return nullptr; } -std::vector ZeroCopyTensor::shape() const { return {}; } +std::vector ZeroCopyTensor::shape() const { return {}; } void ZeroCopyTensor::SetLoD(const std::vector> &x) {} diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index b92781e4f2c612cbb39fcaa7c80b6051a67215fd..258a79fa4e884177490fab79778151ae52537aa0 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -50,6 +50,11 @@ class Timer { } }; +static int GetUniqueId() { + static int id = 0; + return id++; +} + static void split(const std::string &str, char sep, std::vector *pieces) { pieces->clear(); @@ -81,6 +86,13 @@ static void split_to_int64(const std::string &str, char sep, std::transform(pieces.begin(), pieces.end(), std::back_inserter(*is), [](const std::string &v) { return std::stoi(v); }); } +static void split_to_int(const std::string &str, char sep, + std::vector *is) { + std::vector pieces; + split(str, sep, &pieces); + std::transform(pieces.begin(), pieces.end(), std::back_inserter(*is), + [](const std::string &v) { return std::stoi(v); }); +} template std::string to_string(const std::vector &vec) { std::stringstream ss; @@ -127,9 +139,8 @@ static void TensorAssignData(PaddleTensor *tensor, } template -static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor, - const std::vector> &data) { - int size{0}; +static void ZeroCopyTensorAssignData(ZeroCopyTensor *tensor, + const std::vector> &data) { auto *ptr = tensor->mutable_data(PaddlePlace::kCPU); int c = 0; for (const auto &f : data) { @@ -137,7 +148,15 @@ static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor, ptr[c++] = v; } } - return size; +} + +template +static void ZeroCopyTensorAssignData(ZeroCopyTensor *tensor, + const PaddleBuf &data) { + auto *ptr = tensor->mutable_data(PaddlePlace::kCPU); + for (size_t i = 0; i < data.length() / sizeof(T); i++) { + ptr[i] = *(reinterpret_cast(data.data()) + i); + } } static bool CompareTensor(const PaddleTensor &a, const PaddleTensor &b) { @@ -197,6 +216,9 @@ static std::string DescribeTensor(const PaddleTensor &tensor, case PaddleDType::INT64: os << "int64"; break; + case PaddleDType::INT32: + os << "int32"; + break; default: os << "unset"; } diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc new file mode 100644 index 0000000000000000000000000000000000000000..de75e884f53143d9026636ad8663d89a36a30f69 --- /dev/null +++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc @@ -0,0 +1,437 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
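With the copy_from_cpu/copy_to_cpu additions above, a zero-copy inference round trip looks roughly as follows. This is a sketch only: it assumes the predictor was created with feed/fetch ops disabled (config.SwitchUseFeedFetchOps(false)) and an input named "image", both of which are illustrative:

    #include <vector>
    #include "paddle/fluid/inference/api/paddle_inference_api.h"

    void RunOnce(paddle::PaddlePredictor* predictor, const std::vector<float>& in) {
      auto in_t = predictor->GetInputTensor("image");  // illustrative input name
      in_t->Reshape({1, 3, 224, 224});  // Reshape must precede copy_from_cpu
      in_t->copy_from_cpu(in.data());

      predictor->ZeroCopyRun();

      auto out_t = predictor->GetOutputTensor(predictor->GetOutputNames()[0]);
      std::vector<int> shape = out_t->shape();  // shape() now returns vector<int>
      int numel = 1;
      for (int d : shape) numel *= d;
      std::vector<float> out(numel);
      out_t->copy_to_cpu(out.data());  // synchronizes the device on GPU builds
    }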
+ +#include "paddle/fluid/inference/api/mkldnn_quantizer.h" +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/api/analysis_predictor.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { + +using platform::CPUPlace; +using framework::LoDTensor; +using framework::ir::Graph; +using ConstEigenVectorArrayMap = + Eigen::Map>; +using string::PrettyLogH1; + +bool AnalysisPredictor::MkldnnQuantizer::CalculateScales() { + PrettyLogH1("--- Calculating scales for quantization"); + using VariableNameMap = std::map>; + std::map> gathered_data; + for (const auto* op : predictor_.inference_program_->Block(0).AllOps()) { + if (op->HasAttr("use_quantizer") && + boost::get(op->GetAttr("use_quantizer"))) { + const VariableNameMap& connections_in = op->Inputs(); + const VariableNameMap& connections_out = op->Outputs(); + + auto glambda = [&](const VariableNameMap& connections, bool is_output) { + for (auto const& conn : connections) { + if (conn.second.size() == 0) continue; + auto& var_name = conn.second[0]; + + // skip if scale already computed + if (scales_.find(var_name) != scales_.end()) return; + + auto* var = predictor_.sub_scope_->FindVar(var_name); + PADDLE_ENFORCE(var, "%s is not in the scope", var_name); + PADDLE_ENFORCE(var->IsType(), + "Only support lod tensor now."); + LoDTensor* var_tensor = var->GetMutable(); + + // force unsigned type if already know it + bool is_unsigned = false; + if (is_output && op->Type() == "conv2d") { + // output of conv2d with relu must be unsigned + is_unsigned = op->HasAttr("fuse_relu") && + boost::get(op->GetAttr("fuse_relu")); + } else if (is_output && op->Type() == "pool2d") { + // output of pool2d with unsigned input must be unsigned + auto input_var_name = op->Input("X")[0]; + if (scales_.find(input_var_name) != scales_.end()) { + is_unsigned = scales_[input_var_name].first; + } + } + + CalculateSingleScale(op->Type(), conn.first, var_name, *var_tensor, + is_unsigned); + } + }; + + // handle outputs first so unsigned outputs could be inferred + glambda(connections_out, true /* is_output */); + glambda(connections_in, false /* is_output */); + } + } + + return true; +} + +void AnalysisPredictor::MkldnnQuantizer::CalculateSingleScale( + const std::string& op_type_name, const std::string& conn_name, + const std::string& var_name, const LoDTensor& var_tensor, + bool is_unsigned) { + auto rule = qconfig_->scale_algo(op_type_name, conn_name); + if (rule == ScaleAlgo::NONE) return; + + PADDLE_ENFORCE( + var_tensor.numel() > 0, + "MkldnnQuantizer: LoDTensor of variable %s for quantization of op " + "%s of connection %s should not be empty.", + var_name, op_type_name, conn_name); + + switch (rule) { + case ScaleAlgo::MAX: + scales_[var_name] = GetMaxScalingFactor(var_tensor, is_unsigned); + break; + case ScaleAlgo::MAX_CH: + scales_[var_name] = GetMaxChScalingFactor(var_tensor, is_unsigned); + break; + case ScaleAlgo::KL: + scales_[var_name] = GetKLScalingFactor(var_tensor, is_unsigned); + break; + default: + throw std::runtime_error( + "MkldnnQuantizer: Unexpected ScaleAlgo specified."); + } +} + +std::vector 
AnalysisPredictor::MkldnnQuantizer::ExpandQuantizedBins( + std::vector quantized_bins, std::vector reference_bins) const { + std::vector expanded_quantized_bins(reference_bins.size(), 0); + int num_merged_bins = reference_bins.size() / quantized_bins.size(); + int j_start = 0; + int j_end = num_merged_bins; + for (size_t idx = 0; idx < quantized_bins.size(); idx++) { + int zero_count = + std::count(&reference_bins[j_start], &reference_bins[j_end], 0); + num_merged_bins = j_end - j_start; + int avg_bin_ele; + if (zero_count == num_merged_bins) { + avg_bin_ele = 0; + } else { + avg_bin_ele = quantized_bins[idx] / (num_merged_bins - zero_count + 0.0); + } + for (int idx1 = j_start; idx1 < j_end; idx1++) { + expanded_quantized_bins[idx1] = + (reference_bins[idx1] == 0) ? 0 : avg_bin_ele; + } + j_start += num_merged_bins; + j_end += num_merged_bins; + if ((idx + 1) == quantized_bins.size() - 1) { + j_end = reference_bins.size(); + } + } + return expanded_quantized_bins; +} + +std::pair +AnalysisPredictor::MkldnnQuantizer::GetKLScalingFactor( + const LoDTensor& var_tensor, bool is_unsigned) const { + ConstEigenVectorArrayMap eigen_tensor{var_tensor.data(), + var_tensor.numel(), 1}; + int precision_hist_num_bins = 2048; + float max_val = eigen_tensor.maxCoeff(); + float min_val = eigen_tensor.minCoeff(); + bool is_positive = min_val >= 0.0f; + if (is_unsigned) + PADDLE_ENFORCE( + is_positive, + "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0", + min_val); + + int num_quantized_bins = 255; + + std::vector hist; + float bin_width; + int starting_iter; + int ending_iter = precision_hist_num_bins - 1; + if (is_positive) { + std::tie(hist, bin_width) = + Histogram(var_tensor, min_val, max_val, precision_hist_num_bins); + starting_iter = static_cast(ending_iter * 0.7); + } else { + float th = std::max(std::abs(max_val), std::abs(min_val)); + std::tie(hist, bin_width) = + Histogram(var_tensor, -th, th, precision_hist_num_bins); + starting_iter = 0; + if (std::abs(max_val) > std::abs(min_val)) { + while (starting_iter < ending_iter) { + if (hist[starting_iter] == 0) { + ++starting_iter; + continue; + } else { + break; + } + } + starting_iter += static_cast((ending_iter - starting_iter) * 0.6); + } else { + while (ending_iter > 0) { + if (hist[ending_iter] == 0) { + --ending_iter; + continue; + } else { + break; + } + } + starting_iter = static_cast(0.6 * ending_iter); + } + } + auto P_sum = eigen_tensor.size(); + int min_kl_divergence = 0; + int min_kl_index = 0; + bool kl_inited = false; + for (int i = starting_iter; i <= ending_iter; i++) { + std::vector reference_distr_P(&hist[0], &hist[i]); + auto outliers_count = + std::accumulate(&hist[i], &hist[precision_hist_num_bins], 0); + if (reference_distr_P[i - 1] == 0) { + continue; + } + reference_distr_P[i - 1] += outliers_count; + auto reference_distr_bins = reference_distr_P; + std::vector candidate_distr_Q(&hist[0], &hist[i]); + int num_merged_bins = i / num_quantized_bins; + std::vector candidate_distr_Q_quantized(num_quantized_bins, 0); + int j_start = 0; + int j_end = num_merged_bins; + for (int idx = 0; idx < num_quantized_bins; idx++) { + candidate_distr_Q_quantized[idx] = std::accumulate( + &candidate_distr_Q[j_start], &candidate_distr_Q[j_end], 0); + j_start += num_merged_bins; + j_end += num_merged_bins; + if ((idx + 1) == num_quantized_bins - 1) { + j_end = i; + } + } + candidate_distr_Q = + ExpandQuantizedBins(candidate_distr_Q_quantized, reference_distr_bins); + int Q_sum = + std::accumulate(candidate_distr_Q.begin(), 
candidate_distr_Q.end(), 0); + auto kl_divergence = + SafeEntropy(reference_distr_P, P_sum, candidate_distr_Q, Q_sum); + if (!kl_inited) { + min_kl_divergence = kl_divergence; + min_kl_index = i; + kl_inited = true; + } else if (kl_divergence < min_kl_divergence) { + min_kl_divergence = kl_divergence; + min_kl_index = i; + } else { + } + } + if (min_kl_index == 0) { + while (starting_iter > 0) { + if (hist[starting_iter] == 0) { + starting_iter -= 1; + continue; + } else { + break; + } + } + min_kl_index = starting_iter; + } + + LoDTensor scale_tensor; + scale_tensor.Resize({1}); + auto* scale_ptr = scale_tensor.mutable_data(CPUPlace()); + + scale_ptr[0] = 1.0 / ((min_kl_index + 0.5) * bin_width); + + return std::make_pair(is_unsigned, scale_tensor); +} + +std::pair +AnalysisPredictor::MkldnnQuantizer::GetMaxScalingFactor( + const LoDTensor& var_tensor, bool is_unsigned) const { + ConstEigenVectorArrayMap eigen_tensor{var_tensor.data(), + var_tensor.numel(), 1}; + float max_abs = eigen_tensor.abs().maxCoeff(); + float min_val = eigen_tensor.minCoeff(); + if (is_unsigned) + PADDLE_ENFORCE( + min_val >= 0.0f, + "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0", + min_val); + + LoDTensor scale_tensor; + scale_tensor.Resize({1}); + auto* scale_ptr = scale_tensor.mutable_data(CPUPlace()); + scale_ptr[0] = 1.0 / max_abs; + + return std::make_pair(is_unsigned, scale_tensor); +} + +std::pair +AnalysisPredictor::MkldnnQuantizer::GetMaxChScalingFactor( + const LoDTensor& var_tensor, bool is_unsigned) const { + PADDLE_ENFORCE(var_tensor.dims().size() > 0, "Tensor dimension is empty."); + + ConstEigenVectorArrayMap eigen_tensor{var_tensor.data(), + var_tensor.numel(), 1}; + float min_val = eigen_tensor.minCoeff(); + if (is_unsigned) + PADDLE_ENFORCE( + min_val >= 0.0f, + "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0", + min_val); + + int channels = var_tensor.dims()[0]; + LoDTensor scale_tensor; + scale_tensor.Resize({channels}); + auto* scale_ptr = scale_tensor.mutable_data(CPUPlace()); + + for (int i = 0; i < channels; ++i) { + const auto tensor = var_tensor.Slice(i, i + 1); + + ConstEigenVectorArrayMap eigen_tensor{tensor.data(), tensor.numel(), + 1}; + float max_abs = eigen_tensor.abs().maxCoeff(); + scale_ptr[i] = 1.0 / max_abs; + } + + return std::make_pair(is_unsigned, scale_tensor); +} + +std::pair, float> +AnalysisPredictor::MkldnnQuantizer::Histogram( + const framework::LoDTensor& var_tensor, float min_val, float max_val, + size_t num_bins) const { + PADDLE_ENFORCE_GT(num_bins, 0, + "MkldnnQuantizer: To calculate Histogram, num_bins (" + + std::to_string(num_bins) + ") must be positive."); + PADDLE_ENFORCE_GT( + var_tensor.numel(), 0, + "MkldnnQuantizer: To calculate Histogram, the tensor must not be empty."); + PADDLE_ENFORCE(max_val >= min_val, + "MkldnnQuantizer: To calculate Histogram, max_val (" + + std::to_string(max_val) + + ") must be greater or equal" + "to min_val (" + + std::to_string(min_val) + ")."); + ConstEigenVectorArrayMap eigen_tensor{var_tensor.data(), + var_tensor.numel(), 1}; + auto bin_width = std::abs(max_val - min_val) / num_bins; + std::vector hist(num_bins); + + for (int i = 0; i < eigen_tensor.size(); i++) { + int bin = std::min( + num_bins - 1, + static_cast(floor((eigen_tensor[i] - min_val) / bin_width))); + ++hist[bin]; + } + + return std::make_pair(std::move(hist), std::move(bin_width)); +} + +void AnalysisPredictor::MkldnnQuantizer::PrepareArgument() const { + auto& arg = predictor_.argument_; + if (!arg.scope_valid()) 
arg.SetScope(new framework::Scope); + arg.SetMainProgramNotOwned(predictor_.inference_program_.get()); + auto graph = std::unique_ptr(new Graph(arg.main_program())); + arg.SetMainGraph(graph.release()); + arg.main_graph().Set(framework::ir::kParamScopeAttr, + new framework::Scope*(arg.scope_ptr())); + + auto* builder = predictor_.config_.pass_builder(); + builder->SetPasses({ + "infer_clean_graph_pass", "cpu_quantize_pass", "cpu_quantize_squash_pass", + }); + if (predictor_.config_.ir_debug_) builder->TurnOnDebug(); + auto passes = builder->AllPasses(); + predictor_.argument_.SetIrAnalysisPasses(passes); + predictor_.argument_.SetAnalysisPasses( + {"ir_analysis_pass", "memory_optimize_pass", "ir_graph_to_program_pass"}); + predictor_.argument_.SetQuantVarScales(scales_); +} + +bool AnalysisPredictor::MkldnnQuantizer::Quantize() { + if (!RunWarmup()) return false; + if (!CalculateScales()) return false; + predictor_.PrepareScope(predictor_.scope_); + predictor_.CreateExecutor(); + if (!RunQuantizePasses()) return false; + predictor_.PrepareExecutor(); + predictor_.PrepareFeedFetch(); + return true; +} + +bool AnalysisPredictor::MkldnnQuantizer::RunQuantizePasses() const { + predictor_.executor_->CreateVariables(*predictor_.inference_program_, 0, true, + predictor_.sub_scope_); + PrepareArgument(); + auto& arg = predictor_.argument_; + Analyzer().Run(&arg); + PADDLE_ENFORCE(arg.scope_valid()); + VLOG(5) << "to prepare executor"; + ARGUMENT_CHECK_FIELD((&arg), ir_analyzed_program); + predictor_.inference_program_.reset( + new framework::ProgramDesc(arg.ir_analyzed_program())); + LOG(INFO) << "== optimize 2 end =="; + predictor_.executor_->CreateVariables(*predictor_.inference_program_, 0, + false, predictor_.sub_scope_); + return true; +} + +bool AnalysisPredictor::MkldnnQuantizer::RunWarmup() const { + VLOG(3) << "Predictor: run a quantization warmup iteration"; + auto warmup_data = qconfig_->warmup_data(); + PADDLE_ENFORCE_NOT_NULL(warmup_data, + "Warmup data cannot be NULL in the config."); + PrettyLogH1("--- Running warmup iteration for quantization"); + + // Run the inference program + std::vector output_slots; + predictor_.Run(*warmup_data, &output_slots, qconfig_->warmup_batch_size()); + + return true; +} + +float AnalysisPredictor::MkldnnQuantizer::SafeEntropy( + std::vector reference_distr_P, int P_sum, + std::vector candidate_distr_Q, int Q_sum) const { + PADDLE_ENFORCE_EQ(reference_distr_P.size(), candidate_distr_Q.size()); + float tmp_sum1 = 0; + float tmp_sum2 = 0; + for (size_t idx = 0; idx < reference_distr_P.size(); idx++) { + int p_idx = reference_distr_P[idx]; + int q_idx = candidate_distr_Q[idx]; + if (p_idx == 0) { + tmp_sum1 += 0; + tmp_sum2 += 0; + } else { + PADDLE_ENFORCE(q_idx != 0, "MkldnnQuantizer: Fatal error!, idx = " + + std::to_string(idx) + + " qindex = 0! p_idx = " + + std::to_string(p_idx)); + } + tmp_sum1 += p_idx * (log(Q_sum * p_idx)); + tmp_sum2 += p_idx * (log(P_sum * q_idx)); + } + return (tmp_sum1 - tmp_sum2) / P_sum; +} + +} // namespace paddle diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.h b/paddle/fluid/inference/api/mkldnn_quantizer.h new file mode 100644 index 0000000000000000000000000000000000000000..f4b0df5d742ed12f856fc7982d955e89288a1888 --- /dev/null +++ b/paddle/fluid/inference/api/mkldnn_quantizer.h @@ -0,0 +1,104 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/api/analysis_predictor.h" +#include "paddle/fluid/inference/api/api_impl.h" +#include "paddle/fluid/inference/api/details/reset_tensor_array.h" +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/string/printf.h" +#ifdef PADDLE_WITH_TESTING +#include +#include +#endif + +namespace paddle { + +/* + * Map variable name to tensor of scaling factors scaling it to MAX=1.0. + * bool denotes whether quantization of the variable should be done to unsigned + * type. + */ +using VarQuantScale = + std::unordered_map>; + +class AnalysisPredictor::MkldnnQuantizer { + public: + explicit MkldnnQuantizer( + AnalysisPredictor& predictor, // NOLINT + const std::shared_ptr& qconfig) + : predictor_(predictor), qconfig_(qconfig) {} + + // Execute full quantization procedure. + bool Quantize(); + +#if PADDLE_WITH_TESTING + friend class MkldnnQuantizerTest; +#endif + + private: + // Run single warmup iteration + bool RunWarmup() const; + // Gather data from variables and calculate scales for them. + bool CalculateScales(); + // Calculate a scale for tensor based on ScaleAlgo rules. + void CalculateSingleScale(const std::string& op_name, + const std::string& conn_name, + const std::string& var_name, + const framework::LoDTensor& var_tensor, + bool is_unsigned); + void PrepareArgument() const; + bool RunQuantizePasses() const; + + std::vector ExpandQuantizedBins(std::vector quantized_bins, + std::vector reference_bins) const; + + // Using the KL-divergence method get the most precise scaling factor. + std::pair GetKLScalingFactor( + const framework::LoDTensor& var_tensor, bool is_unsigned) const; + + std::pair GetMaxChScalingFactor( + const framework::LoDTensor& var_tensor, bool is_unsigned) const; + + std::pair GetMaxScalingFactor( + const framework::LoDTensor& var_tensor, bool is_unsigned) const; + + // Returns histogram and bin width + std::pair, float> Histogram( + const framework::LoDTensor& var_tensor, float min_val, float max_val, + size_t num_bins = 2048) const; + + // Calculate the entropy. + float SafeEntropy(std::vector reference_distr_P, int P_sum, + std::vector candidate_distr_Q, int Q_sum) const; + + private: + AnalysisPredictor& predictor_; + const std::shared_ptr qconfig_; + + // A map: variable name -> scale + VarQuantScale scales_; +}; + +} // namespace paddle diff --git a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc new file mode 100644 index 0000000000000000000000000000000000000000..f9ff542d86d2a7a3ac2e7f004e11eddfea3598d5 --- /dev/null +++ b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc @@ -0,0 +1,40 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h" + +namespace paddle { + +MkldnnQuantizerConfig::MkldnnQuantizerConfig() { + // The default configuration of scale computing algorithms + rules_["conv2d"]["Input"] = ScaleAlgo::KL; + rules_["conv2d"]["Filter"] = ScaleAlgo::MAX_CH; + rules_["conv2d"]["Bias"] = ScaleAlgo::NONE; // do not compute scale + rules_["conv2d"]["ResidualData"] = ScaleAlgo::KL; + rules_["conv2d"]["Output"] = ScaleAlgo::KL; + + rules_["pool2d"]["X"] = ScaleAlgo::KL; + rules_["pool2d"]["Out"] = ScaleAlgo::KL; +} + +ScaleAlgo MkldnnQuantizerConfig::scale_algo( + const std::string& op_type_name, const std::string& conn_name) const { + if (rules_.find(op_type_name) != rules_.end()) { + auto op_rule = rules_.at(op_type_name); + if (op_rule.find(conn_name) != op_rule.end()) return op_rule.at(conn_name); + } + return default_scale_algo_; +} + +} // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index c1c6227cdd8b2042f6765c7932327ecae246c260..2ad4add2945d65037829e0bb453372e38a04421c 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -14,9 +14,11 @@ #pragma once #include +#include #include #include #include +#include #include /*! \file */ @@ -25,10 +27,14 @@ // the abstract path of this header file will be changed. #include "paddle_api.h" // NOLINT #include "paddle_pass_builder.h" // NOLINT +#ifdef PADDLE_WITH_MKLDNN +#include "paddle_mkldnn_quantizer_config.h" // NOLINT +#endif namespace paddle { class AnalysisPredictor; +struct MkldnnQuantizerConfig; // NOTE WIP, not stable yet. struct AnalysisConfig { @@ -135,10 +141,21 @@ struct AnalysisConfig { */ void EnableTensorRtEngine(int workspace_size = 1 << 20, int max_batch_size = 1, int min_subgraph_size = 3, - Precision precision = Precision::kFloat32); + Precision precision = Precision::kFloat32, + bool use_static = false); /** A boolean state telling whether the TensorRT engine is used. */ bool tensorrt_engine_enabled() const { return use_tensorrt_; } + /** + * \brief Turn on the usage of Anakin sub-graph engine. + */ + void EnableAnakinEngine( + int max_batch_size = 1, + std::map<std::string, std::vector<int>> max_input_shape = {}); + + /** A boolean state indicating whether the Anakin sub-graph engine is used. + */ + bool anakin_engine_enabled() const { return use_anakin_; } /** \brief Control whether to debug IR graph analysis phase. * @@ -173,6 +190,16 @@ struct AnalysisConfig { mkldnn_enabled_op_types_ = op_list; } + /** Turn on quantization. + */ + void EnableMkldnnQuantizer(); + + /** A boolean state telling whether the quantization is enabled. 
+ */ + bool mkldnn_quantizer_enabled() const { return use_mkldnn_quantizer_; } + + std::shared_ptr<MkldnnQuantizerConfig> mkldnn_quantizer_config() const; + /** Specify the memory buffer of program and parameter * @param prog_buffer the memory buffer of program. * @param prog_buffer_size the size of the data. @@ -184,6 +211,7 @@ struct AnalysisConfig { /** A boolean state telling whether the model is set from the CPU memory. */ bool model_from_memory() const { return model_from_memory_; } + void SetEngineOptInfo(std::map<std::string, std::string> engine_opt_info); /** Turn on memory optimize * NOTE still in development, will release latter. @@ -233,6 +261,7 @@ struct AnalysisConfig { // subgraph, 3 as default value. int tensorrt_min_subgraph_size_{3}; Precision tensorrt_precision_mode_; + bool trt_use_static_engine_; // memory reuse related. bool enable_memory_optim_{false}; @@ -256,6 +285,14 @@ struct AnalysisConfig { std::string serialized_info_cache_; mutable std::unique_ptr<PassStrategy> pass_builder_; + + bool use_anakin_{false}; + int anakin_max_batchsize_; + std::map<std::string, std::vector<int>> anakin_max_input_shape_; + std::map<std::string, std::string> engine_opt_info_; + + bool use_mkldnn_quantizer_{false}; + std::shared_ptr<MkldnnQuantizerConfig> mkldnn_quantizer_config_; }; } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index c9a45b4aa3b4037d3725622fc960848bc1ccfb2c..87f40f09eb9bb552bd246cb39bbbd41abac1c9ac 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -36,6 +36,7 @@ namespace paddle { enum PaddleDType { FLOAT32, INT64, + INT32, // TODO(Superjomn) support more data types if needed. }; @@ -160,11 +161,23 @@ class ZeroCopyTensor { template <typename T> T* data(PaddlePlace* place, int* size) const; - std::vector<int64_t> shape() const; + template <typename T> + void copy_from_cpu(const T* data); + + template <typename T> + void copy_to_cpu(T* data); + + std::vector<int> shape() const; void SetLoD(const std::vector<std::vector<size_t>>& x); std::vector<std::vector<size_t>> lod() const; const std::string& name() const { return name_; } + void SetPlace(PaddlePlace place, int device = -1) { + place_ = place; + device_ = device; + } + + PaddleDType type() const; protected: explicit ZeroCopyTensor(void* scope) : scope_{scope} {} @@ -179,6 +192,9 @@ class ZeroCopyTensor { // The corresponding tensor pointer inside Paddle workspace is cached for // performance. mutable void* tensor_{nullptr}; + PaddlePlace place_; + PaddleDType dtype_; + int device_; }; /** A simple Inference API for Paddle. @@ -200,6 +216,14 @@ class PaddlePredictor { std::vector<PaddleTensor>* output_data, int batch_size = -1) = 0; + /** \brief Get input names of the model + */ + virtual std::vector<std::string> GetInputNames() { return {}; } + + /** \brief Get output names of the model + */ + virtual std::vector<std::string> GetOutputNames() { return {}; } + /** \brief Get a mutable tensor directly. * * NOTE Only works in AnalysisPredictor. diff --git a/paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h b/paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h new file mode 100644 index 0000000000000000000000000000000000000000..d46f842de7a2277ee5d00672386b12af7ba28deb --- /dev/null +++ b/paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h @@ -0,0 +1,105 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "paddle_api.h" // NOLINT + +namespace paddle { + +// Algorithms for finding scale of quantized Tensors. +enum class ScaleAlgo { + NONE, // Do not compute scale + MAX, // Find scale based on the maximum absolute value + MAX_CH, // Find scale based on the maximum absolute value per channel + KL, // Find scale based on KL Divergence +}; + +struct MkldnnQuantizerConfig { + MkldnnQuantizerConfig(); + + /** Specify a quantization algorithm for a connection (input/output) of the + * operator type. + * @param op_type_name the operator's name. + * @param conn_name name of the connection (input/output) of the operator. + * @param algo the algorithm for computing scale. + */ + void SetScaleAlgo(std::string op_type_name, std::string conn_name, + ScaleAlgo algo) { + rules_[op_type_name][conn_name] = algo; + } + + /** Get the quantization algorithm for a connection (input/output) of the + * operator type. + * @param op_type_name the operator's name. + * @param conn_name name of the connection (input/output) of the operator. + * @return the algorithm for computing scale. + */ + ScaleAlgo scale_algo(const std::string& op_type_name, + const std::string& conn_name) const; + + /** Set the batch of data to be used for warm-up iteration. + * @param data batch of data. + */ + void SetWarmupData(std::shared_ptr> data) { + warmup_data_ = data; + } + + /** Get the batch of data used for warm-up iteration. + * @return batch of data. + */ + std::shared_ptr> warmup_data() const { + return warmup_data_; + } + + void SetWarmupBatchSize(int batch_size) { warmup_bs_ = batch_size; } + + int warmup_batch_size() const { return warmup_bs_; } + + void SetEnabledOpTypes(std::unordered_set op_list) { + enabled_op_types_ = op_list; + } + + const std::unordered_set& enabled_op_types() const { + return enabled_op_types_; + } + + void SetExcludedOpIds(std::unordered_set op_ids_list) { + excluded_op_ids_ = op_ids_list; + } + + const std::unordered_set& excluded_op_ids() const { + return excluded_op_ids_; + } + + void SetDefaultScaleAlgo(ScaleAlgo algo) { default_scale_algo_ = algo; } + + ScaleAlgo default_scale_algo() const { return default_scale_algo_; } + + protected: + std::map> rules_; + std::unordered_set enabled_op_types_; + std::unordered_set excluded_op_ids_; + std::shared_ptr> warmup_data_; + int warmup_bs_{1}; + ScaleAlgo default_scale_algo_{ScaleAlgo::MAX}; +}; + +} // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 92c24647e87a096e7cfbbf69876b678fe48842a4..8ec32b3a0b7fe459518e269fc72b182bc168435f 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -68,10 +68,26 @@ void GpuPassStrategy::EnableMKLDNN() { LOG(ERROR) << "GPU not support MKLDNN yet"; } +// The following passes works for Anakin sub-graph engine. 
+const std::vector kAnakinSubgraphPasses({ + "infer_clean_graph_pass", // + "simplify_anakin_detection_pattern_pass5", // + "simplify_anakin_detection_pattern_pass4", // + "simplify_anakin_detection_pattern_pass3", // + "simplify_anakin_detection_pattern_pass2", // + "anakin_fillconstant_elementwisemul_fuse", // + "fc_fuse_pass", // + "conv_elementwise_add_fuse_pass", // + "conv_bn_fuse_pass", // + "conv_elementwise_add_fuse_pass", // + "fc_gru_fuse_pass", // + "anakin_subgraph_pass", +}); + GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { passes_.assign({ - "infer_clean_graph_pass", // - "identity_scale_op_clean_pass", // + "infer_clean_graph_pass", // + // "identity_scale_op_clean_pass", // "conv_affine_channel_fuse_pass", // "conv_eltwiseadd_affine_channel_fuse_pass", // "conv_bn_fuse_pass", // @@ -80,16 +96,21 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { "conv_elementwise_add_act_fuse_pass", // "conv_elementwise_add2_act_fuse_pass", // "conv_elementwise_add_fuse_pass", // + "runtime_context_cache_pass", // #endif }); - for (int i = 6; i >= 3; i--) { + for (int i = 6; i >= 2; i--) { passes_.push_back("transpose_flatten" + std::to_string(i) + "_concat_fuse_pass"); } use_gpu_ = true; } +void GpuPassStrategy::EnableMkldnnQuantizer() { + LOG(ERROR) << "GPU not support MKL-DNN quantization"; +} + void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) { analysis_passes_.push_back(pass); } @@ -115,7 +136,9 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) { "conv_eltwiseadd_bn_fuse_pass", // "is_test_pass", // "identity_scale_op_clean_pass", // + "runtime_context_cache_pass", // }); use_gpu_ = false; } +void PaddlePassBuilder::ClearPasses() { passes_.clear(); } } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 2524d89fcd1322e105ad2217347aa2380448f2bc..48da8c156f426477011bcc060260c812ad94df23 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -30,6 +30,10 @@ class PaddlePassBuilder { explicit PaddlePassBuilder(const std::vector &passes) : passes_(passes) {} + void SetPasses(std::initializer_list passes) { + passes_ = passes; + } + /** Append a pass to the end of the passes. */ void AppendPass(const std::string &pass_type); @@ -45,6 +49,7 @@ class PaddlePassBuilder { /** Delete all the passes that has type `pass_type`. */ void DeletePass(const std::string &pass_type); + void ClearPasses(); /** Append an analysis pass. 
   */
  void AppendAnalysisPass(const std::string &pass);
@@ -84,6 +89,10 @@ class PassStrategy : public PaddlePassBuilder {
   */
  virtual void EnableMKLDNN() {}

+  /** Enable MKLDNN quantize optimization.
+   */
+  virtual void EnableMkldnnQuantizer() {}
+
  bool use_gpu() const { return use_gpu_; }

  virtual ~PassStrategy() = default;
@@ -112,6 +121,8 @@ class CpuPassStrategy : public PassStrategy {
      for (auto &pass : std::vector<std::string>(
               {"depthwise_conv_mkldnn_pass",    //
+               "conv_bn_fuse_pass",             // Execute BN passes again to
+               "conv_eltwiseadd_bn_fuse_pass",  // preserve correct pass order
                "conv_bias_mkldnn_fuse_pass",    //
                "conv3d_bias_mkldnn_fuse_pass",  //
                "conv_relu_mkldnn_fuse_pass",    //
@@ -124,6 +135,20 @@ class CpuPassStrategy : public PassStrategy {
    use_mkldnn_ = false;
 #endif
  }
+
+  void EnableMkldnnQuantizer() override {
+#ifdef PADDLE_WITH_MKLDNN
+    if (!use_mkldnn_quantizer_) {
+      passes_.push_back("cpu_quantize_placement_pass");
+    }
+    use_mkldnn_quantizer_ = true;
+#else
+    use_mkldnn_quantizer_ = false;
+#endif
+  }
+
+ protected:
+  bool use_mkldnn_quantizer_{false};
};

/** The GPU passes strategy; it is used in AnalysisPredictor with GPU mode.
 */
class GpuPassStrategy : public PassStrategy {
@@ -138,8 +163,11 @@ class GpuPassStrategy : public PassStrategy {
  }

  void EnableMKLDNN() override;
+  void EnableMkldnnQuantizer() override;

  virtual ~GpuPassStrategy() = default;
};

+extern const std::vector<std::string> kAnakinSubgraphPasses;
+
}  // namespace paddle
diff --git a/paddle/fluid/inference/engine.h b/paddle/fluid/inference/engine.h
index ce2b8161715a3fa2278ce950dbac82c6d0042bef..1a13ba510384c010e476bf0ba0ad5b0ba84d3240 100644
--- a/paddle/fluid/inference/engine.h
+++ b/paddle/fluid/inference/engine.h
@@ -49,11 +49,6 @@ class EngineBase {
  // Execute the engine, that will run the inference network.
  virtual void Execute(int batch_size) = 0;

-  // Return the IO buffer that allocated in engine. One can read/write directly
-  // on the buffer. If the buffer's buffer is nullptr, one can also allocate
-  // memory and maintain it outside the engine.
-  virtual Buffer& buffer(const std::string& name) = 0;
-
  virtual ~EngineBase() {}
};  // class EngineBase
diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
index 7900f56c9ce17ffc7c62c85a42c62ba326dea16e..39a99a21ea702032669ed4ed3016ab34128c9925 100644
--- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
@@ -18,21 +18,6 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {

-bool to_skip_merging_optimize(TensorRTEngine* engine,
-                              const std::vector<int>& filters,
-                              const std::vector<int>& strides,
-                              const std::vector<int>& paddings,
-                              std::string input_name) {
-  if (engine->itensor_quote_num[input_name] > 0) {
-    return true;
-  }
-  if (filters[0] == 1 && filters[1] == 1 && strides[0] == 1 &&
-      strides[1] == 1 && paddings[0] == 0 && paddings[1] == 0)
-    engine->itensor_quote_num[input_name] += 1;
-
-  return false;
-}
-
 template <typename RegistFunc, typename SetDilationFunc>
 void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
                    const framework::Scope& scope, bool test_mode,
@@ -59,7 +44,7 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
   weight_tensor->Resize(Y_t->dims());
   TensorCopySync((*Y_t), cpu_place, weight_tensor.get());

-  auto* weight_data = weight_tensor->mutable_data<float>(platform::CPUPlace());
+  auto* weight_data = weight_tensor->mutable_data<float>(cpu_place);

   PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL);
   const int n_output = weight_tensor->dims()[0];
@@ -100,9 +85,7 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
   layer->getOutput(0)->setName(output_name.c_str());
   engine->SetITensor(output_name, layer->getOutput(0));

-  if (test_mode ||
-      to_skip_merging_optimize(engine, {filter_h, filter_w}, strides, paddings,
-                               op_desc.Input("Input").front())) {
+  if (test_mode) {
     engine->DeclareOutput(output_name);
   }
 }
diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
index 79362f9677010247dffa4fbaa155a7a56eed6f85..0c5a1a6ef16f05308df22452ed5e184e94e117d2 100644
--- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
@@ -153,7 +153,6 @@ class ElementwiseTensorOpConverter : public OpConverter {
     if (CheckDims(dims_x, dims_y)) {
       // The two input tensors should have the same dims
       VLOG(3) << "Convert a fluid elementwise op to TensorRT IElementWiseLayer";
-
       nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER(
           engine_, ElementWise, *const_cast<nvinfer1::ITensor*>(X),
           *const_cast<nvinfer1::ITensor*>(Y), op_pair->second);
@@ -166,7 +165,7 @@ class ElementwiseTensorOpConverter : public OpConverter {
                  "ElementWisePluginLayer";

       plugin::ElementWisePlugin* plugin =
-          new plugin::ElementWisePlugin(op_pair->second, dims_x, dims_y, axis);
+          new plugin::ElementWisePlugin(op_type_, dims_x, dims_y, axis);
       plugin->AddInput(X);
       plugin->AddInput(Y);
       nvinfer1::IPluginLayer* layer = engine_->AddPlugin(
diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
index eef4fab4e86f05fa80bc614371f1aa43e433407e..42dcd68e40e04e775961fd943070f3df2f28d99a 100644
--- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
@@ -85,10 +85,10 @@ class FcOpConverter : public OpConverter {
            Y_t->dims()[0] * Y_t->dims()[1] * sizeof(float));

   TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
                                 static_cast<void*>(weight_data),
-                                Y_t->memory_size() / sizeof(float)};
+                                static_cast<size_t>(Y_t->numel())};
   TensorRTEngine::Weight tmp_weight(nvinfer1::DataType::kFLOAT,
                                     static_cast<void*>(tmp->data<float>()),
-                                    Y_t->memory_size() / sizeof(float));
+                                    static_cast<size_t>(Y_t->numel()));

   weight.dims.assign({Y_t->dims()[0], Y_t->dims()[1]});
   tmp_weight.dims = weight.dims;
diff --git a/paddle/fluid/inference/tensorrt/convert/io_converter.h b/paddle/fluid/inference/tensorrt/convert/io_converter.h
index 71c48e085d25d2bc6720d93735f661f9e3af7b40..5daa242f6ab802a50fa6105f0102b817b700f461 100644
--- a/paddle/fluid/inference/tensorrt/convert/io_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/io_converter.h
@@ -45,7 +45,7 @@ class EngineIOConverter {
   static void ConvertInput(const std::string& op_type, const LoDTensor& in,
                            void* out, size_t max_size, cudaStream_t* stream) {
     PADDLE_ENFORCE(stream != nullptr);
-    auto* converter = Registry<EngineIOConverter>::Lookup(
+    auto* converter = Registry<EngineIOConverter>::Global().Lookup(
         op_type, "default" /* default_type */);
     PADDLE_ENFORCE_NOT_NULL(converter);
     converter->SetStream(stream);
@@ -56,7 +56,7 @@ class EngineIOConverter {
                             LoDTensor* out, size_t max_size,
                             cudaStream_t* stream) {
     PADDLE_ENFORCE(stream != nullptr);
-    auto* converter = Registry<EngineIOConverter>::Lookup(
+    auto* converter = Registry<EngineIOConverter>::Global().Lookup(
         op_type, "default" /* default_type */);
     PADDLE_ENFORCE_NOT_NULL(converter);
     converter->SetStream(stream);
@@ -69,12 +69,12 @@ class EngineIOConverter {
   cudaStream_t* stream_{nullptr};
 };

-#define REGISTER_TENSORRT_IO_CONVERTER(op_type__, Converter__)          \
-  struct trt_io_##op_type__##_converter {                               \
-    trt_io_##op_type__##_converter() {                                  \
-      Registry<EngineIOConverter>::Register<Converter__>(#op_type__);   \
-    }                                                                   \
-  };                                                                    \
+#define REGISTER_TENSORRT_IO_CONVERTER(op_type__, Converter__)          \
+  struct trt_io_##op_type__##_converter {                               \
+    trt_io_##op_type__##_converter() {                                  \
+      Registry<EngineIOConverter>::Global().Register<Converter__>(      \
+          #op_type__);                                                  \
+    }                                                                   \
+  };                                                                    \
   trt_io_##op_type__##_converter trt_io_##op_type__##_converter__;

 }  // namespace tensorrt
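The recurring edit in these converter files is mechanical: `Registry<T>::Lookup(...)` becomes `Registry<T>::Global().Lookup(...)`, replacing static class-level state with an explicit singleton instance. A simplified model of that pattern follows; the real template lives in paddle/fluid/inference/utils/singleton.h and differs in detail:

```cpp
#include <functional>
#include <string>
#include <unordered_map>

// Simplified illustration of the Registry<T>::Global() pattern; not the
// actual Paddle implementation.
template <typename ItemParent>
class Registry {
 public:
  static Registry& Global() {
    static Registry registry;  // one shared registry per converter base type
    return registry;
  }
  template <typename ItemChild>
  void Register(const std::string& name) {
    creators_[name] = []() -> ItemParent* { return new ItemChild; };
  }
  ItemParent* Lookup(const std::string& name) {
    auto it = creators_.find(name);
    return it == creators_.end() ? nullptr : it->second();
  }

 private:
  Registry() = default;
  std::unordered_map<std::string, std::function<ItemParent*()>> creators_;
};
```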
diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h
index 91670ba8ac5332fe6e83b7bff14cb1a349d7e2a2..55515569ead6e40c9b1b45fe31189dab7e2f2bb4 100644
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -16,9 +16,12 @@ limitations under the License.
 */
 #include <string>
 #include <unordered_map>
+#include <unordered_set>
+#include <vector>
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/inference/utils/singleton.h"

@@ -26,6 +29,37 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {

+using FluidDT = framework::proto::VarType_Type;
+using TRT_DT = nvinfer1::DataType;
+
+namespace {  // NOLINT
+
+TRT_DT FluidDataType2TRT(FluidDT type) {
+  switch (type) {
+    case FluidDT::VarType_Type_FP32:
+      return TRT_DT::kFLOAT;
+    case FluidDT::VarType_Type_INT32:
+      return TRT_DT::kINT32;
+    default:
+      return TRT_DT::kINT32;
+  }
+  PADDLE_THROW("unknown type");
+  return TRT_DT::kINT32;
+}
+
+nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
+  PADDLE_ENFORCE_GT(shape.size(), 1UL,
+                    "TensorRT's tensor input requires at least 2 dimensions");
+  PADDLE_ENFORCE_LE(shape.size(), 4UL,
+                    "TensorRT's tensor input requires at most 4 dimensions");
+  PADDLE_ENFORCE(shape.size() == 4UL || shape.size() == 2UL);
+  if (shape.size() == 4UL)
+    return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]);
+  return nvinfer1::DimsCHW(shape[1], 1, 1);
+}
+
+}  // namespace  // NOLINT
+
 /*
  * Convert Op from Fluid to TensorRT Engine.
  */
@@ -52,7 +86,7 @@ class OpConverter {
       PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL);
       std::string Y = op_desc.Input("Y")[0];
       if (parameters.count(Y)) {
-        it = Registry<OpConverter>::Lookup("fc");
+        it = Registry<OpConverter>::Global().Lookup("fc");
       }
     }
     if (op_desc.Type().find("elementwise") != std::string::npos) {
@@ -69,28 +103,28 @@ class OpConverter {
       if (parameters.count(Y)) {
         PADDLE_ENFORCE(add_weight_op_set.count(op_type) > 0,
                        "Unsupported elementwise type" + op_type);
-        it =
-            Registry<OpConverter>::Lookup("elementwise_" + op_type + "_weight");
+        it = Registry<OpConverter>::Global().Lookup("elementwise_" + op_type +
+                                                    "_weight");
         PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
                                 op_desc.Type());
       } else {
         PADDLE_ENFORCE(add_tensor_op_set.count(op_type) > 0,
                        "Unsupported elementwise type" + op_type);
-        it =
-            Registry<OpConverter>::Lookup("elementwise_" + op_type + "_tensor");
+        it = Registry<OpConverter>::Global().Lookup("elementwise_" + op_type +
+                                                    "_tensor");
       }
       PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
                               op_desc.Type());
     }

     if (op_desc.Type() == "depthwise_conv2d") {
-      it = Registry<OpConverter>::Lookup("conv2d");
+      it = Registry<OpConverter>::Global().Lookup("conv2d");
       PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
                               op_desc.Type());
     }

     if (!it) {
-      it = Registry<OpConverter>::Lookup(op_desc.Type());
+      it = Registry<OpConverter>::Global().Lookup(op_desc.Type());
     }
     PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
                             op_desc.Type());
@@ -110,6 +144,34 @@ class OpConverter {
     }
   }

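A quick worked example of the two helpers introduced above. TensorRT in this implicit-batch API drops the leading batch dimension, so fluid shapes map as follows:

```cpp
// Assuming FluidDataType2TRT and Vec2TRT_Dims above are visible in this
// translation unit:
std::vector<int64_t> conv_in = {8, 3, 224, 224};  // NCHW; N=8 is implicit in TRT
nvinfer1::Dims d1 = Vec2TRT_Dims(conv_in);        // -> DimsCHW(3, 224, 224)

std::vector<int64_t> fc_in = {8, 512};            // [N, K]
nvinfer1::Dims d2 = Vec2TRT_Dims(fc_in);          // -> DimsCHW(512, 1, 1)
```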
+  // The scope here should be initialized with the parameter variables.
+  void ConvertBlockToTRTEngine(
+      framework::BlockDesc* block_desc, const framework::Scope& scope,
+      const std::vector<std::string>& inputs,
+      const std::unordered_set<std::string>& parameters,
+      const std::vector<std::string>& outputs, TensorRTEngine* engine) {
+    engine->InitNetwork();
+    for (auto& input : inputs) {
+      if (parameters.count(input)) continue;
+      auto* var = block_desc->FindVar(input);
+      PADDLE_ENFORCE(var, "no variable called %s", input);
+      PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
+                        "TensorRT engine only takes LoDTensor as input");
+      auto var_shape = var->GetShape();
+
+      engine->DeclareInput(
+          input, FluidDataType2TRT(
+                     var->Proto()->type().lod_tensor().tensor().data_type()),
+          Vec2TRT_Dims(var_shape));
+    }
+    framework::proto::BlockDesc* block_proto = block_desc->Proto();
+    ConvertBlock(*block_proto, parameters, scope, engine);
+    for (auto& output : outputs) {
+      engine->DeclareOutput(output);
+    }
+    engine->FreezeNetwork();
+  }
+
   void SetEngine(TensorRTEngine* engine) { engine_ = engine; }

   virtual ~OpConverter() {}
@@ -136,9 +198,9 @@ class OpConverter {
 #define REGISTER_TRT_OP_CONVERTER(op_type__, Converter__)                      \
   struct trt_##op_type__##_converter : public ::paddle::framework::Registrar { \
     trt_##op_type__##_converter() {                                            \
-      ::paddle::inference::                                                    \
-          Registry<paddle::inference::tensorrt::OpConverter>::Register<        \
-              ::paddle::inference::tensorrt::Converter__>(#op_type__);         \
+      ::paddle::inference::Registry<                                           \
+          paddle::inference::tensorrt::OpConverter>::Global()                  \
+          .Register<::paddle::inference::tensorrt::Converter__>(#op_type__);   \
     }                                                                          \
   };                                                                           \
   trt_##op_type__##_converter trt_##op_type__##_converter__;                   \
diff --git a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc
index dbdff85ddebc85bc51938a204a48affe485b8240..2ae804106e5f7b51fc43e33cad986619e6a57d74 100644
--- a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc
@@ -43,23 +43,20 @@ class PReluOpConverter : public OpConverter {
     PADDLE_ENFORCE_NOT_NULL(alpha_var);
     auto* alpha_tensor = alpha_var->GetMutable<framework::LoDTensor>();

-    platform::CUDAPlace place;
-    std::unique_ptr<framework::LoDTensor> alpha_tensor_device(
+    platform::CPUPlace cpu_place;
+    std::unique_ptr<framework::LoDTensor> alpha_tensor_temp(
         new framework::LoDTensor());
-    alpha_tensor_device->Resize(alpha_tensor->dims());
-    TensorCopySync(*alpha_tensor, place, alpha_tensor_device.get());
-    float* alpha_data = alpha_tensor_device->mutable_data<float>(place);
+    alpha_tensor_temp->Resize(alpha_tensor->dims());
+    TensorCopySync(*alpha_tensor, cpu_place, alpha_tensor_temp.get());
+    float* alpha_data = alpha_tensor_temp->mutable_data<float>(cpu_place);

-    // Transform alpha to TensorRTEngine::Weight
-    TensorRTEngine::Weight alpha_rt(nvinfer1::DataType::kFLOAT,
-                                    static_cast<void*>(alpha_data),
-                                    alpha_tensor_device->numel());
-    plugin::PReluPlugin* plugin = new plugin::PReluPlugin(alpha_rt, mode);
+    plugin::PReluPlugin* plugin =
+        new plugin::PReluPlugin(alpha_data, alpha_tensor_temp->numel(), mode);
     nvinfer1::IPluginLayer* layer =
         engine_->AddPlugin(&input, input_num, plugin);
     // keep alpha tensor to avoid releasing its memory
     engine_->weight_map[op_desc.Input("Alpha")[0]] =
-        std::move(alpha_tensor_device);
+        std::move(alpha_tensor_temp);

     std::string layer_name = "prelu (Output: ";
     auto output_name = op_desc.Output("Out")[0];
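For orientation, a sketch of driving the new `ConvertBlockToTRTEngine` helper above; `block_desc`, `scope`, and `params` are assumed to be supplied by the analysis/sub-graph pass that owns the fluid block:

```cpp
// Illustrative only: names of the inputs/outputs are hypothetical.
TensorRTEngine engine(/*max_batch=*/16, /*max_workspace=*/1 << 20);
OpConverter converter;
converter.ConvertBlockToTRTEngine(&block_desc, scope,
                                  /*inputs=*/{"x"},
                                  /*parameters=*/params,
                                  /*outputs=*/{"y"}, &engine);
// On return the network is frozen and the engine can serve Execute() calls.
```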
diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
index e83961f3d7bda03a7659f175c59105dcb60708e9..2571abbf69892dae626c7178609c2825775fdf2e 100644
--- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h
+++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
@@ -19,7 +19,9 @@ limitations under the License. */

 #pragma once

+#include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>

 #include "paddle/fluid/framework/lod_tensor.h"
@@ -79,7 +81,8 @@ class TRTConvertValidation {
         if_add_batch_(if_add_batch),
         max_batch_size_(max_batch_size) {
     PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
-    engine_.reset(new TensorRTEngine(max_batch_size, workspace_size, stream_));
+    engine_.reset(
+        new TensorRTEngine(max_batch_size, workspace_size, false, nullptr, 0));
     engine_->InitNetwork();
   }

@@ -114,13 +117,12 @@ class TRTConvertValidation {
   }

   void DeclVar(const std::string& name, const std::vector<int> dim_vec) {
-    platform::CUDAPlace place;
-    platform::CUDADeviceContext ctx(place);
+    platform::CUDADeviceContext ctx(place_);

     auto* x = scope_.Var(name);
     auto* x_tensor = x->GetMutable<framework::LoDTensor>();
     x_tensor->Resize(framework::make_ddim(dim_vec));
-    RandomizeTensor(x_tensor, place, ctx);
+    RandomizeTensor(x_tensor, place_, ctx);
   }
   // Declare a variable in a fluid Scope.
   void DeclVar(const std::string& name, const nvinfer1::Dims& dims,
@@ -146,19 +148,6 @@ class TRTConvertValidation {

     // Declare outputs.
     op_desc_.reset(new framework::OpDesc(desc, nullptr));
-
-    // Set Inputs.
-    for (const auto& input : op_desc_->InputArgumentNames()) {
-      if (parameters_.count(input)) continue;
-      auto* var = scope_.FindVar(input);
-      PADDLE_ENFORCE(var);
-      auto tensor = var->GetMutable<framework::LoDTensor>();
-
-      engine_->SetInputFromGPU(
-          input, static_cast<void*>(tensor->data<float>()),
-          sizeof(float) *
-              analysis::AccuDims(tensor->dims(), tensor->dims().size()));
-    }
   }

   // We use the set 'neglected_output' here, because some Ops like batch norm,
@@ -168,43 +157,71 @@ class TRTConvertValidation {
                std::unordered_set<std::string> neglected_output = {}) {
     // Execute Fluid Op
     PADDLE_ENFORCE_LE(batch_size, max_batch_size_);
-    platform::CUDAPlace place;
-    platform::CUDADeviceContext ctx(place);
-    op_->Run(scope_, place);
-    // Execute TRT.
-    engine_->Execute(batch_size);
-    cudaStreamSynchronize(engine_->stream());
+    platform::CUDADeviceContext ctx(place_);
+    op_->Run(scope_, place_);

-    ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
-    const size_t output_space_size = 3000;
+    std::vector<std::string> input_output_names;
+
+    // Note: we need to filter out the parameters.
+    for (const auto& input : op_desc_->InputArgumentNames()) {
+      if (parameters_.count(input)) continue;
+      input_output_names.push_back(input);
+    }
+
+    // Collect the fluid outputs.
+    std::vector<std::vector<float>> fluid_outs;
     for (const auto& output : op_desc_->OutputArgumentNames()) {
       if (neglected_output.count(output)) continue;
+      input_output_names.push_back(output);
       std::vector<float> fluid_out;
-      std::vector<float> trt_out(output_space_size);
-      engine_->GetOutputInCPU(output, &trt_out[0], output_space_size);
-      cudaStreamSynchronize(engine_->stream());
-
       auto* var = scope_.FindVar(output);
-      auto tensor = var->GetMutable<framework::LoDTensor>();
+      auto* tensor = var->GetMutable<framework::LoDTensor>();
       framework::TensorToVector(*tensor, ctx, &fluid_out);
+      fluid_outs.push_back(fluid_out);
+    }
+
+    // Bind input and output for TRT.
+    const int num_bindings = input_output_names.size();
+    std::vector<void*> buffers(num_bindings);
+
+    for (const std::string& name : input_output_names) {
+      auto* var = scope_.FindVar(name);
+      auto* tensor = var->GetMutable<framework::LoDTensor>();
+      const int bind_index = engine_->engine()->getBindingIndex(name.c_str());
+      buffers[bind_index] =
+          static_cast<void*>(tensor->mutable_data<float>(place_));
+    }
+
+    // Execute TRT.
+    engine_->Execute(batch_size, &buffers, stream_);

-      size_t fluid_out_size = fluid_out.size();
+    ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
+    int index = 0;
+    for (const auto& output : op_desc_->OutputArgumentNames()) {
+      if (neglected_output.count(output)) continue;
+      std::vector<float> trt_out;
+      auto* var = scope_.FindVar(output);
+      auto* tensor = var->GetMutable<framework::LoDTensor>();
+      framework::TensorToVector(*tensor, ctx, &trt_out);
+
+      size_t fluid_out_size = fluid_outs[index].size();
       if (if_add_batch_ == true) {
         fluid_out_size =
             batch_size * (framework::product(tensor->dims()) / max_batch_size_);
       }
-      // Compare two output
-      ASSERT_FALSE(fluid_out.empty());
+
       for (size_t i = 0; i < fluid_out_size; i++) {
         // Loosen the threshold for CI in different machine model.
-        EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 2e-5);
+        EXPECT_LT(std::abs(fluid_outs[index][i] - trt_out[i]), 2e-5);
       }
+      index += 1;
     }
   }

   framework::Scope& scope() { return scope_; }

  private:
+  platform::CUDAPlace place_;
   std::unique_ptr<TensorRTEngine> engine_;
   cudaStream_t stream_;
   std::unique_ptr<framework::OperatorBase> op_;
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index 10f48462cfaf8073a4f5537d654d614d36b74db4..fddf5f11c285da4687b08d1962b6f1f51390e03e 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -32,36 +32,18 @@ void TensorRTEngine::Build(const DescType &paddle_model) {
   PADDLE_ENFORCE(false, "not implemented");
 }

-void TensorRTEngine::Execute(int batch_size) {
+void TensorRTEngine::Execute(int batch_size, std::vector<void *> *buffers,
+                             cudaStream_t stream) {
   freshDeviceId();
   batch_size_ = batch_size;
-  std::vector<void *> buffers;
-  for (auto &buf : buffers_) {
-    PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated");
-    PADDLE_ENFORCE_GT(buf.max_size, 0);
-    PADDLE_ENFORCE(buf.device == DeviceType::GPU);
-    buffers.push_back(buf.buffer);
-  }
-  infer_context_->enqueue(batch_size, buffers.data(), stream_, nullptr);
-  cudaStreamSynchronize(stream_);
+  infer_context_->enqueue(batch_size, buffers->data(), stream, nullptr);
+  cudaStreamSynchronize(stream);
   SetRuntimeBatch(batch_size);
 }

-TensorRTEngine::~TensorRTEngine() {
-  cudaStreamSynchronize(stream_);
-  // clean buffer
-  for (auto &buf : buffers_) {
-    if (buf.device == DeviceType::GPU && buf.buffer != nullptr) {
-      PADDLE_ENFORCE_EQ(0, cudaFree(buf.buffer));
-      buf.buffer = nullptr;
-      buf.max_size = 0;
-    }
-  }
-}
-
 void TensorRTEngine::FreezeNetwork() {
-  VLOG(3) << "TRT to freeze network";
   freshDeviceId();
+  VLOG(3) << "TRT to freeze network";
   PADDLE_ENFORCE(infer_builder_ != nullptr,
                  "Call InitNetwork first to initialize network.");
   PADDLE_ENFORCE(infer_network_ != nullptr,
@@ -81,30 +63,6 @@ void TensorRTEngine::FreezeNetwork() {
   PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!");

   infer_context_.reset(infer_engine_->createExecutionContext());
-
-  // allocate GPU buffers.
-  buffers_.resize(buffer_sizes_.size());
-  for (auto &item : buffer_sizes_) {
-    // The output buffers are not set in the network building phase; they need
-    // to be inferred from the TensorRT network.
-    if (item.second == 0) {
-      auto slot_offset = infer_engine_->getBindingIndex(item.first.c_str());
-      auto dims = infer_engine_->getBindingDimensions(slot_offset);
-      item.second = kDataTypeSize[static_cast<int>(
-                        infer_engine_->getBindingDataType(slot_offset))] *
-                    analysis::AccuDims(dims.d, dims.nbDims) * max_batch_;
-      PADDLE_ENFORCE_GT(item.second, 0);
-    }
-
-    auto &buf = buffer(item.first);
-    buf.max_size = item.second * max_batch_;
-    CHECK(buf.buffer == nullptr);  // buffer should be allocated only once.
-
-    PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second * max_batch_));
-    buf.size = 0;
-    PADDLE_ENFORCE_LE(buf.max_size, 1 << 30);  // 10G
-    buf.device = DeviceType::GPU;
-  }
 }

 nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
@@ -158,83 +116,6 @@ void TensorRTEngine::DeclareOutput(const std::string &name) {

   buffer_sizes_[name] = 0;
 }

-void *TensorRTEngine::GetOutputInGPU(const std::string &name) {
-  return buffer(name).buffer;
-}
-
-void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst,
-                                    size_t max_size) {
-  // determine data size
-  auto *output = TensorRTEngine::GetITensor(name);
-  nvinfer1::Dims dims = output->getDimensions();
-  auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
-  size_t dst_size = dim_size * runtime_batch_ *
-                    kDataTypeSize[static_cast<int>(output->getType())];
-
-  auto it = buffer_sizes_.find(name);
-  PADDLE_ENFORCE(it != buffer_sizes_.end());
-  PADDLE_ENFORCE_GT(it->second, 0);
-  PADDLE_ENFORCE_LE(dst_size, it->second);
-  PADDLE_ENFORCE_GE(max_size, dst_size);
-  auto &buf = buffer(name);
-  PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
-  PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, dst_size,
-                                    cudaMemcpyDeviceToDevice, stream_),
-                    0);
-}
-
-void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst,
-                                    size_t max_size) {
-  // determine data size
-
-  auto *output = TensorRTEngine::GetITensor(name);
-  nvinfer1::Dims dims = output->getDimensions();
-  auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
-  size_t dst_size = dim_size * runtime_batch_ *
-                    kDataTypeSize[static_cast<int>(output->getType())];
-  auto it = buffer_sizes_.find(name);
-  PADDLE_ENFORCE(it != buffer_sizes_.end());
-  PADDLE_ENFORCE_GT(it->second, 0);
-  PADDLE_ENFORCE_LE(dst_size, it->second);
-  PADDLE_ENFORCE_GE(max_size, dst_size);
-  auto &buf = buffer(name);
-  PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
-  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, dst_size,
-                                       cudaMemcpyDeviceToHost, stream_));
-}
-
-Buffer &TensorRTEngine::buffer(const std::string &name) {
-  PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first.");
-  auto it = buffer_sizes_.find(name);
-  PADDLE_ENFORCE(it != buffer_sizes_.end(), "tried to access buffer named %s",
-                 name);
-  auto slot_offset = infer_engine_->getBindingIndex(name.c_str());
-  return buffers_[slot_offset];
-}
-
-void TensorRTEngine::SetInputFromCPU(const std::string &name, const void *data,
-                                     size_t size) {
-  auto &buf = buffer(name);
-  PADDLE_ENFORCE_NOT_NULL(buf.buffer);
-  PADDLE_ENFORCE_NOT_NULL(data);
-  PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
-  PADDLE_ENFORCE(buf.device == DeviceType::GPU);
-  buf.size = size;
-  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
-                                       cudaMemcpyHostToDevice, stream_));
-}
-
-void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data,
-                                     size_t size) {
-  auto &buf = buffer(name);
-  buf.size = size;
-  PADDLE_ENFORCE_NOT_NULL(buf.buffer);
-  PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
-  PADDLE_ENFORCE(buf.device == DeviceType::GPU);
-  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
-                                       cudaMemcpyDeviceToDevice, stream_));
-}
-
 void TensorRTEngine::SetITensor(const std::string &name,
                                 nvinfer1::ITensor *tensor) {
   PADDLE_ENFORCE(tensor != nullptr);
@@ -254,13 +135,6 @@ void TensorRTEngine::SetRuntimeBatch(size_t batch_size) {

 int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; }

-void TensorRTEngine::freshDeviceId() {
-  int count;
-  cudaGetDeviceCount(&count);
-  PADDLE_ENFORCE_LT(device_, count);
-  cudaSetDevice(device_);
-}
-
 nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin(
     nvinfer1::ITensor *const *inputs, int num_inputs,
     plugin::PluginTensorRT *plugin) {
@@ -268,6 +142,13 @@ nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin(
   return infer_network_.get()->addPluginExt(inputs, num_inputs, *plugin);
 }

+void TensorRTEngine::freshDeviceId() {
+  int count;
+  cudaGetDeviceCount(&count);
+  PADDLE_ENFORCE_LT(device_id_, count);
+  cudaSetDevice(device_id_);
+}
+
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index cdfe09b5a7fd2d1f8548dab9421f671f5a345153..657dfd9355f9e3167a123b1f71655869d030a3df 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -23,6 +23,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/engine.h"
 #include "paddle/fluid/inference/tensorrt/helper.h"
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
 #include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
 #include "paddle/fluid/inference/utils/singleton.h"

@@ -37,7 +38,9 @@ class TRTInt8Calibrator;
 * There are two alternative ways to use it: one is to build from a paddle
 * protobuf model; the other is to manually construct the network.
 */
-class TensorRTEngine : public EngineBase {
+class TensorRTEngine {
+  using DescType = ::paddle::framework::proto::BlockDesc;
+
 public:
  // Weight is model parameter.
  class Weight {
@@ -56,28 +59,28 @@ class TensorRTEngine : public EngineBase {
    nvinfer1::Weights w_;
  };

-  TensorRTEngine(int max_batch, int max_workspace, cudaStream_t stream,
-                 int device = 0, bool enable_int8 = false,
-                 TRTInt8Calibrator* calibrator = nullptr,
+  TensorRTEngine(int max_batch, int max_workspace, bool enable_int8 = false,
+                 TRTInt8Calibrator* calibrator = nullptr, int device_id = 0,
                  nvinfer1::ILogger& logger = NaiveLogger::Global())
      : max_batch_(max_batch),
        max_workspace_(max_workspace),
-        stream_(stream),
-        device_(device),
        enable_int8_(enable_int8),
        calibrator_(calibrator),
+        device_id_(device_id),
        logger_(logger) {}

-  virtual ~TensorRTEngine();
+  ~TensorRTEngine() {}

  // TODO(Superjomn) implement it later when graph segmentation is supported.
-  void Build(const DescType& paddle_model) override;
+  void Build(const DescType& paddle_model);

-  void Execute(int batch_size) override;
+  void Execute(int batch_size, std::vector<void*>* buffers,
+               cudaStream_t stream);
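The buffer-management removal above changes the calling convention: the engine no longer owns device memory, and callers bind their own GPU pointers by binding index before calling the new `Execute`. A sketch, assuming `d_input`/`d_output` were allocated by the caller (e.g. via `cudaMalloc`):

```cpp
// Caller-managed bindings, replacing the removed SetInputFromCPU /
// GetOutputInCPU round trips. "x" and "y" are assumed binding names.
std::vector<void*> buffers(engine.engine()->getNbBindings());
buffers[engine.engine()->getBindingIndex("x")] = d_input;   // GPU pointer
buffers[engine.engine()->getBindingIndex("y")] = d_output;  // GPU pointer
engine.Execute(batch_size, &buffers, stream);  // synchronizes on `stream`
```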
  // Initialize the inference network, so that TensorRT layers can add to this
  // network.
  void InitNetwork() {
+    freshDeviceId();
    infer_builder_.reset(createInferBuilder(&logger_));
    infer_network_.reset(infer_builder_->createNetwork());
  }
@@ -98,37 +101,34 @@ class TensorRTEngine : public EngineBase {
  // Check if the ITensor has been declared
  bool HasDeclared(const std::string& name);

-  // GPU memory address for an ITensor with specific name. One can operate on
-  // these memory directly for acceleration, for example, output the converted
-  // data directly to the buffer to save data copy overhead.
-  // NOTE this should be used after calling `FreezeNetwork`.
-  Buffer& buffer(const std::string& name) override;
-
-  cudaStream_t stream() { return stream_; }
-
-  // Fill an input from CPU memory with name and size.
-  void SetInputFromCPU(const std::string& name, const void* data, size_t size);
-  // TODO(Superjomn) is this method necessary given that buffer(xxx) can be
-  // accessed directly. Fill an input from GPU memory with name and size.
-  void SetInputFromGPU(const std::string& name, const void* data, size_t size);
-  // Get an output called name, the output of tensorrt is in GPU, so this method
-  // Return the output's GPU memory address without copy.
-  void* GetOutputInGPU(const std::string& name);
-  // Copy data into dst inside the GPU device.
-  void GetOutputInGPU(const std::string& name, void* dst, size_t max_size);
-  // LOW EFFICENCY! Get output to CPU, this will trigger a memory copy from GPU
-  // to CPU.
-  void GetOutputInCPU(const std::string& name, void* dst, size_t max_size);
-  // Fill an ITensor into map itensor_map_.

  void SetITensor(const std::string& name, nvinfer1::ITensor* tensor);
  // Get an ITensor called name.
  nvinfer1::ITensor* GetITensor(const std::string& name);

  nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); }
  nvinfer1::INetworkDefinition* network() { return infer_network_.get(); }
+
+  nvinfer1::IHostMemory* Serialize() {
+    PADDLE_ENFORCE(infer_engine_ != nullptr,
+                   "You should build the engine first and then serialize it");
+    ihost_memory_.reset(infer_engine_->serialize());
+    return ihost_memory_.get();
+  }
+
+  void Deserialize(const std::string& engine_serialized_data) {
+    freshDeviceId();
+    infer_ptr<nvinfer1::IRuntime> runtime(createInferRuntime(&logger_));
+    infer_engine_.reset(runtime->deserializeCudaEngine(
+        engine_serialized_data.c_str(), engine_serialized_data.size(),
+        &inference::Singleton<plugin::PluginFactoryTensorRT>::Global()));
+    PADDLE_ENFORCE(infer_engine_ != nullptr,
+                   "building the cuda engine failed when deserializing the engine info");
+    infer_context_.reset(infer_engine_->createExecutionContext());
+  }
+
  void SetRuntimeBatch(size_t batch_size);
  int GetRuntimeBatch();
-  int GetDevice() { return device_; }
+  int GetDeviceId() { return device_id_; }
  nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs,
                                    int num_inputs, plugin::PluginTensorRT*);

@@ -140,17 +140,12 @@ class TensorRTEngine : public EngineBase {
  std::unordered_map<std::string, std::unique_ptr<framework::Tensor>>
      weight_map;

-  // TODO(NHZLX)
-  // In the normal case, the paddle-trt has a bug when running the googlenet.
-  // When there are more than two convolutions of 1 * 1 with the same input,
-  // the paddle-tensorrt will do the merging optimization, which fuses those
-  // convs into one conv, and then triggers a bug. So, we should use a strategy
-  // to avoid this optimization for the time being. This bug will be fixed in
-  // the future.
-  std::unordered_map<std::string, int> itensor_quote_num;
-
 private:
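The new `Serialize()`/`Deserialize()` pair above is what makes engine caching possible. A round-trip sketch, assuming `engine` is an already-frozen `TensorRTEngine` and the reload happens on the same device type:

```cpp
// Serialize a built engine into a byte string...
nvinfer1::IHostMemory* blob = engine.Serialize();
std::string bytes(static_cast<const char*>(blob->data()), blob->size());
// ... persist `bytes`, then later rebuild the engine without re-conversion:
TensorRTEngine restored(max_batch, max_workspace);
restored.Deserialize(bytes);  // recreates the ICudaEngine and its context,
                              // using the plugin factory for custom layers
```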
+  // Each ICudaEngine object is bound to a specific GPU when it is instantiated;
+  // ensure that the thread is associated with the correct device by calling
+  // freshDeviceId().
+  void freshDeviceId();
+
  // the max batch size
  int max_batch_;
  // the runtime batch size
@@ -158,18 +153,14 @@ class TensorRTEngine : public EngineBase {
  // the max memory size the engine uses
  int max_workspace_;

-  cudaStream_t stream_;
-  // The specific GPU id that the TensorRTEngine is bound to.
-  int device_;
-
  bool enable_int8_;
  TRTInt8Calibrator* calibrator_;
  // batch size of the current data, will be updated each Execution.
  int batch_size_{-1};

+  int device_id_;
  nvinfer1::ILogger& logger_;

-  std::vector<Buffer> buffers_;
  // max data size for the buffers.
  std::unordered_map<std::string, size_t> buffer_sizes_;
  std::unordered_map<std::string, nvinfer1::ITensor*>
@@ -192,15 +183,11 @@ class TensorRTEngine : public EngineBase {
  infer_ptr<nvinfer1::INetworkDefinition> infer_network_;
  infer_ptr<nvinfer1::ICudaEngine> infer_engine_;
  infer_ptr<nvinfer1::IExecutionContext> infer_context_;
-  // Each ICudaEngine object is bound to a specific GPU when it is instantiated,
-  // ensure that the thread is associated with the correct device by calling
-  // freshDeviceId().
-  void freshDeviceId();
+  infer_ptr<nvinfer1::IHostMemory> ihost_memory_;
};  // class TensorRTEngine

// Add a layer__ into engine__ with args ARGS.
// For example:
-//   TRT_ENGINE_ADD_LAYER(xxx, FullyConnected, input, dim, weights, bias)
//
// Reference
// https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#charRNN_define_network
diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h
index fc7ca7714e9325d2b6bce6189300aa339c81c2ba..010942a0678fe9a592d1a95ba9cdc6adc42cc2ec 100644
--- a/paddle/fluid/inference/tensorrt/helper.h
+++ b/paddle/fluid/inference/tensorrt/helper.h
@@ -17,6 +17,9 @@
 #include <NvInfer.h>
 #include <cuda.h>
 #include <glog/logging.h>
+#include <string>
+#include <utility>
+#include <vector>
 #include "paddle/fluid/platform/dynload/tensorrt.h"
 #include "paddle/fluid/platform/enforce.h"

@@ -74,6 +77,32 @@ class NaiveLogger : public nvinfer1::ILogger {
  ~NaiveLogger() override {}
};

+class NaiveProfiler : public nvinfer1::IProfiler {
+ public:
+  typedef std::pair<std::string, float> Record;
+  std::vector<Record> mProfile;
+
+  virtual void reportLayerTime(const char* layerName, float ms) {
+    auto record =
+        std::find_if(mProfile.begin(), mProfile.end(),
+                     [&](const Record& r) { return r.first == layerName; });
+    if (record == mProfile.end())
+      mProfile.push_back(std::make_pair(layerName, ms));
+    else
+      record->second += ms;
+  }
+
+  void printLayerTimes() {
+    float totalTime = 0;
+    for (size_t i = 0; i < mProfile.size(); i++) {
+      printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(),
+             mProfile[i].second);
+      totalTime += mProfile[i].second;
+    }
+    printf("Time over all layers: %4.3f\n", totalTime);
+  }
+};
+
}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
index 95443e813327c1247ac530c4d2e68b3607ff0e73..709aa103d1b6681221328b180d65e90f08d3368e 100644
--- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
@@ -1,4 +1,5 @@
 nv_library(tensorrt_plugin
-  SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu prelu_op_plugin.cu
+  SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu
+  prelu_op_plugin.cu trt_plugin_factory.cc avg_pool_op_plugin.cu
   DEPS enforce tensorrt_engine prelu)
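The `NaiveProfiler` added to helper.h above plugs into TensorRT's standard `IProfiler` hook. A usage sketch (`infer_context` and `buffers` assumed set up as elsewhere in this PR); note that attaching a profiler forces execution to be synchronous:

```cpp
NaiveProfiler profiler;
infer_context->setProfiler(&profiler);
infer_context->execute(batch_size, buffers.data());  // per-layer times recorded
profiler.printLayerTimes();                          // aggregated report
```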
diff --git a/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu
index 5d747af8c55d71fee90ee0cc06fd328e583f3700..f27a838162c89b6377a7ffd995608b3a5a49eeae 100644
--- a/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu
@@ -13,6 +13,7 @@
 // limitations under the License.

 #include "paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
 #include "paddle/fluid/operators/math/pooling.h"

 namespace paddle {
@@ -20,6 +21,12 @@ namespace inference {
 namespace tensorrt {
 namespace plugin {

+AvgPoolPlugin* CreateAvgPoolPluginDeserialize(const void* buffer,
+                                              size_t length) {
+  return new AvgPoolPlugin(buffer, length);
+}
+REGISTER_TRT_PLUGIN("avg_pool_plugin", CreateAvgPoolPluginDeserialize);
+
 nvinfer1::Dims AvgPoolPlugin::getOutputDimensions(
     int index, const nvinfer1::Dims* inputDims, int nbInputs) {
   assert(nbInputs == 1);
diff --git a/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h
index b5e4ece0fba446627d619df6fe225e8c07231487..a7c0aa5794e6bb131d012cb12d6d9fc12a73bd0d 100644
--- a/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h
@@ -33,24 +33,27 @@ class AvgPoolPlugin : public PluginTensorRT {

 protected:
  size_t getSerializationSize() override {
-    return SerializedSize(ceil_mode_) + SerializedSize(ksize_) +
-           SerializedSize(strides_) + SerializedSize(paddings_) +
-           SerializedSize(input_shape_) + getBaseSerializationSize();
+    return SerializedSize(getPluginType()) + SerializedSize(ceil_mode_) +
+           SerializedSize(ksize_) + SerializedSize(strides_) +
+           SerializedSize(paddings_) + SerializedSize(input_shape_) +
+           SerializedSize(output_shape_) + getBaseSerializationSize();
  }

  // TRT will call this func when we need to serialize the configuration of
  // tensorrt.
-  // It should not be called by users.
  void serialize(void *buffer) override {
+    SerializeValue(&buffer, getPluginType());
    serializeBase(buffer);
    SerializeValue(&buffer, ceil_mode_);
    SerializeValue(&buffer, ksize_);
    SerializeValue(&buffer, strides_);
    SerializeValue(&buffer, paddings_);
    SerializeValue(&buffer, input_shape_);
+    SerializeValue(&buffer, output_shape_);
  }

 public:
+  AvgPoolPlugin() {}
  AvgPoolPlugin(bool ceil_mode, std::vector<int> ksize,
                std::vector<int> strides, std::vector<int> paddings,
                std::vector<int> input_shape)
@@ -89,6 +92,7 @@ class AvgPoolPlugin : public PluginTensorRT {
    DeserializeValue(&serialData, &serialLength, &strides_);
    DeserializeValue(&serialData, &serialLength, &paddings_);
    DeserializeValue(&serialData, &serialLength, &input_shape_);
+    DeserializeValue(&serialData, &serialLength, &output_shape_);
  }

  AvgPoolPlugin *clone() const override {
@@ -96,7 +100,7 @@ class AvgPoolPlugin : public PluginTensorRT {
                             input_shape_);
  }

-  const char *getPluginType() const override { return "avg_pool"; }
+  const char *getPluginType() const override { return "avg_pool_plugin"; }
  int getNbOutputs() const override { return 1; }
  nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs,
                                     int nbInputDims) override;
diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu
index 9cd9026b7328083389b5af484bbb15c07b4908b0..9aed3ddab1448fde7cb6b0e13bcf0b05e23622e9 100644
--- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu
@@ -14,12 +14,19 @@ limitations under the License.
 */
#include <glog/logging.h>
 #include "paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"

 namespace paddle {
 namespace inference {
 namespace tensorrt {
 namespace plugin {

+ElementWisePlugin* CreateElementWisePluginDeserialize(const void* buffer,
+                                                      size_t length) {
+  return new ElementWisePlugin(buffer, length);
+}
+REGISTER_TRT_PLUGIN("elementwise_plugin", CreateElementWisePluginDeserialize);
+
 namespace details {

 template <typename T>
@@ -119,10 +126,10 @@ int ElementWisePlugin::enqueue(int batch_size, const void* const* inputs,
   const float* y = reinterpret_cast<const float*>(inputs[1]);
   float* out = reinterpret_cast<float*>(outputs[0]);

-  if (type_ == nvinfer1::ElementWiseOperation::kSUM) {
+  if (type_ == "add") {
     details::ElementWise(details::Add(), x, y, out, batch_size, prev_size_,
                          midd_size_, post_size_, stream);
-  } else if (type_ == nvinfer1::ElementWiseOperation::kPROD) {
+  } else if (type_ == "mul") {
     details::ElementWise(details::Mul(), x, y, out, batch_size, prev_size_,
                          midd_size_, post_size_, stream);
   } else {
diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h
index 9c461f7a5c44ebb9d4a755288c69abff55e2dea8..3b040f14c531c540b8a855da85ecc3008224526c 100644
--- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h
@@ -14,6 +14,7 @@ limitations under the License. */

 #pragma once

+#include <string>
 #include <vector>
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"

@@ -24,9 +25,8 @@ namespace plugin {

 class ElementWisePlugin : public PluginTensorRT {
 public:
-  ElementWisePlugin(nvinfer1::ElementWiseOperation type,
-                    nvinfer1::Dims const &dims_x, nvinfer1::Dims const &dims_y,
-                    int axis)
+  ElementWisePlugin(std::string type, nvinfer1::Dims const &dims_x,
+                    nvinfer1::Dims const &dims_y, int axis)
      : type_(type),
        dims_x_(dims_x),
        dims_y_(dims_y),
@@ -37,6 +37,9 @@ class ElementWisePlugin : public PluginTensorRT {

  ElementWisePlugin(void const *serial_data, size_t serial_length) {
    deserializeBase(serial_data, serial_length);
+    const char *elementwise_type;
+    DeserializeValue(&serial_data, &serial_length, &elementwise_type);
+    type_ = std::string(elementwise_type);
    DeserializeValue(&serial_data, &serial_length, &axis_);
    DeserializeValue(&serial_data, &serial_length, &dims_x_);
    DeserializeValue(&serial_data, &serial_length, &dims_y_);
@@ -47,7 +50,7 @@ class ElementWisePlugin : public PluginTensorRT {
    return nullptr;
  }

-  const char *getPluginType() const override { return "elementwise"; }
+  const char *getPluginType() const override { return "elementwise_plugin"; }

  nvinfer1::Dims getOutputDimensions(int index,
                                     const nvinfer1::Dims *input_dims,
@@ -61,18 +64,21 @@ class ElementWisePlugin : public PluginTensorRT {

 protected:
  size_t getSerializationSize() override {
-    return SerializedSize(axis_) + SerializedSize(dims_x_) +
-           SerializedSize(dims_y_) + getBaseSerializationSize();
+    return SerializedSize(getPluginType()) + SerializedSize(axis_) +
+           SerializedSize(dims_x_) + SerializedSize(dims_y_) +
+           getBaseSerializationSize();
  }

  void serialize(void *buffer) override {
+    SerializeValue(&buffer, getPluginType());
    serializeBase(buffer);
+    SerializeValue(&buffer, type_.c_str());
    SerializeValue(&buffer, axis_);
    SerializeValue(&buffer, dims_x_);
    SerializeValue(&buffer, dims_y_);
  }

-  nvinfer1::ElementWiseOperation type_;
+  std::string type_;
  nvinfer1::Dims dims_x_;
  nvinfer1::Dims dims_y_;
  int axis_;
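The convention established across these plugin headers is that `serialize()` now writes the plugin type tag first, before the base fields and members. That tag is exactly what the deserialization side reads back to dispatch to the right constructor. A sketch of the consuming side, under the `DeserializeValue` API shown in this PR:

```cpp
// How a plugin blob is consumed at engine-load time (see the
// trt_plugin_factory files later in this diff for the real dispatch):
const void* cursor = serial_data;   // serialized plugin bytes
size_t remaining = serial_length;
const char* plugin_type = nullptr;
DeserializeValue(&cursor, &remaining, &plugin_type);  // e.g. "elementwise_plugin"
// The registered deserialize function for `plugin_type` then rebuilds the
// plugin from (cursor, remaining).
```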
diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
index 3075e87ea6d719a3f49d14c8c4b8015f7d688a50..b8a044fe99b91893c8c9ef661b4f46ebaa6db8c7 100644
--- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
@@ -17,6 +17,7 @@
 #include <vector>
 #include "glog/logging.h"
 #include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
 #include "paddle/fluid/operators/math/prelu.h"

 namespace paddle {
@@ -24,6 +25,17 @@ namespace inference {
 namespace tensorrt {
 namespace plugin {

+PReluPlugin *CreatePreluPluginDeserialize(const void *buffer, size_t length) {
+  return new PReluPlugin(buffer, length);
+}
+REGISTER_TRT_PLUGIN("prelu_plugin", CreatePreluPluginDeserialize);
+
+int PReluPlugin::initialize() {
+  cudaMalloc(&p_gpu_weight_, sizeof(float) * weight_.size());
+  cudaMemcpy(p_gpu_weight_, weight_.data(), weight_.size() * sizeof(float),
+             cudaMemcpyHostToDevice);
+  return 0;
+}
+
 nvinfer1::Dims PReluPlugin::getOutputDimensions(int index,
                                                 const nvinfer1::Dims *inputDims,
                                                 int nbInputs) {
@@ -39,7 +51,8 @@ int PReluPlugin::enqueue(int batch_size, const void *const *inputs,
   // input dims is CHW.
   const auto &input_dims = this->getInputDims(0);
   const float *input = reinterpret_cast<const float *>(inputs[0]);
-  const float *alpha = reinterpret_cast<const float *>(alpha_.get().values);
+  // const float *alpha = reinterpret_cast<const float *>(alpha_.get().values);
+  const float *alpha = p_gpu_weight_;
   float *output = reinterpret_cast<float **>(outputs)[0];

   std::vector<int> input_shape;
diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h
index 0db56a310b072e64425f70ac23267ec72353e54b..a96649503f1c764e07370cb2b47b10f3dae72be4 100644
--- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h
@@ -14,7 +14,12 @@

 #pragma once

+#include <algorithm>
 #include <string>
+#include <vector>
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/tensor_util.h"
+
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"

@@ -24,39 +29,51 @@ namespace tensorrt {
 namespace plugin {

 class PReluPlugin : public PluginTensorRT {
-  TensorRTEngine::Weight alpha_;
+  std::vector<float> weight_;
+  float *p_gpu_weight_;
  std::string mode_;

 protected:
  size_t getSerializationSize() override {
-    // return getBaseSerializationSize(alpha_) + SerializedSize(mode_);
-    return 0;
+    return getBaseSerializationSize() + SerializedSize(mode_.c_str()) +
+           SerializedSize(weight_) + SerializedSize(getPluginType());
  }

  // TRT will call this func when we need to serialize the configuration of
  // tensorrt.
  // It should not be called by users.
  void serialize(void *buffer) override {
-    // serializeBase(buffer);
-    // SerializeValue(&buffer, alpha_);
-    // SerializeValue(&buffer, mode_);
+    SerializeValue(&buffer, getPluginType());
+    serializeBase(buffer);
+    SerializeValue(&buffer, weight_);
+    SerializeValue(&buffer, mode_.c_str());
  }

 public:
-  PReluPlugin(TensorRTEngine::Weight const &alpha, std::string const &mode)
-      : alpha_(alpha), mode_(mode) {}
+  PReluPlugin(const float *weight, const int weight_num,
+              std::string const &mode)
+      : mode_(mode) {
+    weight_.resize(weight_num);
+    std::copy(weight, weight + weight_num, weight_.data());
+  }
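The net effect of the PReluPlugin rewrite is a clean weight lifecycle: alpha values live on the host in `weight_` (so they can travel through serialize/deserialize), while the device copy is created in `initialize()` and released in the destructor. An illustrative call sequence (`channels` is assumed):

```cpp
// Host weights in, device copy managed by the plugin itself.
std::vector<float> alpha(channels, 0.25f);
plugin::PReluPlugin prelu(alpha.data(), alpha.size(), "channel");
prelu.initialize();   // cudaMalloc + cudaMemcpy into p_gpu_weight_
// ... enqueue() reads p_gpu_weight_ on the GPU ...
// ~PReluPlugin() cudaFree()s p_gpu_weight_
```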
  // It was used for tensorrt deserialization.
  // It should not be called by users.
  PReluPlugin(void const *serialData, size_t serialLength) {
-    // deserializeBase(serialData, serialLength);
-    // DeserializeValue(&serialData, &serialLength, &alpha_);
-    // DeserializeValue(&serialData, &serialLength, &mode_);
+    deserializeBase(serialData, serialLength);
+    DeserializeValue(&serialData, &serialLength, &weight_);
+    const char *prelu_mode;
+    DeserializeValue(&serialData, &serialLength, &prelu_mode);
+    mode_ = std::string(prelu_mode);
  }
+  ~PReluPlugin() { cudaFree(p_gpu_weight_); }
+  int initialize() override;

-  PReluPlugin *clone() const override { return new PReluPlugin(alpha_, mode_); }
+  PReluPlugin *clone() const override {
+    return new PReluPlugin(weight_.data(), weight_.size(), mode_);
+  }

-  const char *getPluginType() const override { return "prelu"; }
+  const char *getPluginType() const override { return "prelu_plugin"; }
  int getNbOutputs() const override { return 1; }
  nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs,
                                     int nbInputDims) override;
diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
index de61ace59e299a1f51940e4b433a0133d4fbe7ff..b5503c3b95ee2429dd865fd6de416a04aafbccf0 100644
--- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
@@ -15,12 +15,18 @@
 #include <cuda_fp16.h>
 #include <algorithm>
 #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"

 namespace paddle {
 namespace inference {
 namespace tensorrt {
 namespace plugin {

+SplitPlugin* CreateSplitPluginDeserialize(const void* buffer, size_t length) {
+  return new SplitPlugin(buffer, length);
+}
+REGISTER_TRT_PLUGIN("split_plugin", CreateSplitPluginDeserialize);
+
 // copied from operators::math::SplitFunctor
 template <typename T>
 __global__ void SplitKernel(const T* input_data, const int in_row,
diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
index 6f028d3d72ae3cc7d96c6782b734cdbf1243c06c..cbb72590567a35bee29387d4c00518b437913508 100644
--- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
@@ -15,6 +15,7 @@
 #pragma once

 #include <thrust/device_vector.h>
+#include <utility>
 #include <vector>
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"

@@ -25,6 +26,7 @@ namespace plugin {

 class SplitPlugin : public PluginTensorRT {
 public:
+  SplitPlugin() {}
  SplitPlugin(int axis, std::vector<int> const &output_lengths)
      : axis_(axis), same_shape_(true), output_length_(output_lengths) {}

@@ -38,7 +40,7 @@ class SplitPlugin : public PluginTensorRT {
    return new SplitPlugin(axis_, output_length_);
  }

-  const char *getPluginType() const override { return "split"; }
+  const char *getPluginType() const override { return "split_plugin"; }
  int getNbOutputs() const override { return output_length_.size(); }
  nvinfer1::Dims getOutputDimensions(int index,
                                     const nvinfer1::Dims *input_dims,
@@ -50,11 +52,12 @@ class SplitPlugin : public PluginTensorRT {

 protected:
  size_t getSerializationSize() override {
-    return SerializedSize(axis_) + SerializedSize(output_length_) +
-           getBaseSerializationSize();
+    return SerializedSize(getPluginType()) + SerializedSize(axis_) +
+           SerializedSize(output_length_) + getBaseSerializationSize();
  }

  void serialize(void *buffer) override {
+    SerializeValue(&buffer, getPluginType());
    serializeBase(buffer);
    SerializeValue(&buffer, axis_);
    SerializeValue(&buffer, output_length_);
diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
index 86084829e150f8a39610319a8f2138f2b2fdec68..3b737bd726ad09637f8530a114362d98d1dac1b0 100644
--- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
@@ -17,9 +17,10 @@
 #include <NvInfer.h>
 #include <cstring>
 #include <unordered_map>
+#include <utility>
 #include <vector>

-#include "paddle/fluid/inference/tensorrt/plugin/serialize.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/profiler.h"

@@ -30,6 +31,13 @@ namespace inference {
 namespace tensorrt {
 namespace plugin {

+class PluginTensorRT;
+
+typedef std::function<PluginTensorRT*(const void*, size_t)>
+    PluginDeserializeFunc;
+
+typedef std::function<PluginTensorRT*(void)> PluginConstructFunc;
+
 class PluginTensorRT : public nvinfer1::IPluginExt {
 public:
  PluginTensorRT() {}
diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3c20b6d1e725273dbfdc20c01fb01deea4e8d88e
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc
@@ -0,0 +1,48 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {
+
+PluginTensorRT* PluginFactoryTensorRT::createPlugin(const char* layer_name,
+                                                    const void* serial_data,
+                                                    size_t serial_length) {
+  const char* plugin_type;
+  DeserializeValue(&serial_data, &serial_length, &plugin_type);
+
+  PADDLE_ENFORCE(Has(plugin_type),
+                 "trt plugin type %s does not exist, check it.", plugin_type);
+  auto plugin = plugin_registry_[plugin_type](serial_data, serial_length);
+  owned_plugins_.emplace_back(plugin);
+
+  return plugin;
+}
+
+bool PluginFactoryTensorRT::RegisterPlugin(
+    const std::string& op_name, PluginDeserializeFunc deserialize_func) {
+  if (Has(op_name)) return false;
+  auto ret = plugin_registry_.emplace(op_name, deserialize_func);
+  return ret.second;
+}
+
+void PluginFactoryTensorRT::DestroyPlugins() { owned_plugins_.clear(); }
+
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h
new file mode 100644
index 0000000000000000000000000000000000000000..139c75595f9f44cacf7d14cda6b1c8eb4ef3c0ee
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h
@@ -0,0 +1,78 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <NvInfer.h>
+#include <cstring>
+#include <list>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h"
+#include "paddle/fluid/inference/utils/singleton.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {
+
+class PluginFactoryTensorRT : public nvinfer1::IPluginFactory,
+                              public DeleteHelper {
+ public:
+  // Deserialization method
+  PluginTensorRT* createPlugin(const char* layer_name, const void* serial_data,
+                               size_t serial_length) override;
+
+  bool RegisterPlugin(const std::string& op_name,
+                      PluginDeserializeFunc deserialize_func);
+
+  bool Has(const std::string& op_name) {
+    return plugin_registry_.find(op_name) != plugin_registry_.end();
+  }
+
+  void DestroyPlugins();
+
+ protected:
+  std::unordered_map<std::string, PluginDeserializeFunc> plugin_registry_;
+
+  std::list<std::unique_ptr<PluginTensorRT>> owned_plugins_;
+};
+
+class TrtPluginRegistrar {
+ public:
+  TrtPluginRegistrar(const std::string& name,
+                     PluginDeserializeFunc deserialize_func) {
+    inference::Singleton<PluginFactoryTensorRT>::Global().RegisterPlugin(
+        name, deserialize_func);
+  }
+};
+
+#define REGISTER_TRT_PLUGIN(name, deserialize_func) \
+  REGISTER_TRT_PLUGIN_UNIQ(__COUNTER__, name, deserialize_func)
+
+#define REGISTER_TRT_PLUGIN_UNIQ(ctr, name, deserialize_func)      \
+  static paddle::inference::tensorrt::plugin::TrtPluginRegistrar   \
+      trt_plugin_registrar##ctr __attribute__((unused)) =          \
+          paddle::inference::tensorrt::plugin::TrtPluginRegistrar( \
+              name, deserialize_func)
+
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/serialize.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h
similarity index 96%
rename from paddle/fluid/inference/tensorrt/plugin/serialize.h
rename to paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h
index ce859f16fc87479adf090687121ff06951b5684c..1cae4ccae4cc593785d9b3b0e87523e740eef4ff 100644
--- a/paddle/fluid/inference/tensorrt/plugin/serialize.h
+++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h
@@ -13,8 +13,8 @@
 // limitations under the License.

 #pragma once
-
 #include <cstring>
+#include <string>
 #include <type_traits>
 #include <vector>
 #include "paddle/fluid/platform/enforce.h"
@@ -24,6 +24,13 @@ namespace inference {
 namespace tensorrt {
 namespace plugin {

+// Some TRT base classes lack a virtual destructor.
+// We use a helper class to fix this.
+struct DeleteHelper {
+ protected:
+  virtual ~DeleteHelper() {}
+};
+
 template <typename T>
 inline void SerializeValue(void** buffer, T const& value);
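With the registrar machinery above in place, hooking a new plugin into deserialization is a two-liner. This mirrors the real registrations in the `*_op_plugin.cu` files of this PR; `MyPlugin` here is purely illustrative:

```cpp
// A deserialize function plus one macro invocation at namespace scope.
MyPlugin* CreateMyPluginDeserialize(const void* buffer, size_t length) {
  return new MyPlugin(buffer, length);
}
REGISTER_TRT_PLUGIN("my_plugin", CreateMyPluginDeserialize);
```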
diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc
index 9eed0f6ee9ce4d9e35bec718dc8e8435921dbd81..a03dd45db0f80487cb4c2e6b68f94944e8558ae4 100644
--- a/paddle/fluid/inference/tensorrt/test_engine.cc
+++ b/paddle/fluid/inference/tensorrt/test_engine.cc
@@ -17,6 +17,8 @@ limitations under the License.
 */
 #include <glog/logging.h>
 #include <gtest/gtest.h>
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/tensor_util.h"

 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -27,19 +29,34 @@ namespace tensorrt {
 class TensorRTEngineTest : public ::testing::Test {
 protected:
  void SetUp() override {
-    ASSERT_EQ(0, cudaStreamCreate(&stream_));
-    engine_ = new TensorRTEngine(10, 1 << 10, stream_);
+    ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0));
+
+    engine_ = new TensorRTEngine(10, 1 << 10);
    engine_->InitNetwork();
  }

  void TearDown() override {
-    delete engine_;
-    cudaStreamDestroy(stream_);
+    if (engine_) {
+      delete engine_;
+      engine_ = nullptr;
+    }
+  }
+
+  void PrepareInputOutput(const std::vector<float> &input,
+                          std::vector<int> output_shape) {
+    TensorFromVector(input, *ctx_, &input_);
+    output_.Resize(framework::make_ddim(output_shape));
+  }
+
+  void GetOutput(std::vector<float> *output) {
+    TensorToVector(output_, *ctx_, output);
  }

 protected:
-  TensorRTEngine* engine_;
-  cudaStream_t stream_;
+  framework::Tensor input_;
+  framework::Tensor output_;
+  TensorRTEngine *engine_;
+  platform::CUDADeviceContext *ctx_;
};

TEST_F(TensorRTEngineTest, add_layer) {
@@ -48,12 +65,14 @@ TEST_F(TensorRTEngineTest, add_layer) {
  float raw_weight[size] = {2.};  // Weight in CPU memory.
  float raw_bias[size] = {3.};

+  std::vector<void *> buffers(2);  // TRT bound inputs
+
  LOG(INFO) << "create weights";
  TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, size);
  TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, size);
-  auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
+  auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
                                  nvinfer1::DimsCHW{1, 1, 1});
-  auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, size,
+  auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, size,
                                        weight.get(), bias.get());
  PADDLE_ENFORCE(fc_layer != nullptr);

@@ -63,18 +82,24 @@ TEST_F(TensorRTEngineTest, add_layer) {
  ASSERT_EQ(engine_->engine()->getNbBindings(), 2);

  // fill in real data
-  float x_v = 1234;
-  engine_->SetInputFromCPU("x", reinterpret_cast<void *>(&x_v),
-                           1 * sizeof(float));
+  std::vector<float> x_v = {1234};
+  std::vector<float> y_cpu;
+  PrepareInputOutput(x_v, {1});
+
+  auto *x_v_gpu_data = input_.mutable_data<float>(ctx_->GetPlace());
+  auto *y_gpu_data = output_.mutable_data<float>(ctx_->GetPlace());
+
+  buffers[0] = reinterpret_cast<void *>(x_v_gpu_data);
+  buffers[1] = reinterpret_cast<void *>(y_gpu_data);
+
  LOG(INFO) << "to execute";
-  engine_->Execute(1);
+  engine_->Execute(1, &buffers, ctx_->stream());

  LOG(INFO) << "to get output";
-  float y_cpu;
-  engine_->GetOutputInCPU("y", &y_cpu, 1 * sizeof(float));
+  GetOutput(&y_cpu);

  LOG(INFO) << "to checkout output";
-  ASSERT_EQ(y_cpu, x_v * 2 + 3);
+  ASSERT_EQ(y_cpu[0], x_v[0] * 2 + 3);
}

TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
@@ -83,12 +108,13 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
  // instead of row-major, which is [[1.0, 1.1], [3.3, 4.4]]
  float raw_weight[4] = {1.0, 1.1, 3.3, 4.4};
  float raw_bias[2] = {1.3, 2.4};
+  std::vector<void *> buffers(2);  // TRT bound inputs

  TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 4);
  TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 2);
-  auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
+  auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
                                  nvinfer1::DimsCHW{1, 2, 1});
-  auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, 2,
TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, 2, weight.get(), bias.get()); PADDLE_ENFORCE(fc_layer != nullptr); @@ -96,19 +122,27 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) { engine_->FreezeNetwork(); ASSERT_EQ(engine_->engine()->getNbBindings(), 2); - float x_v[2] = {1.0, 2.0}; - engine_->SetInputFromCPU("x", reinterpret_cast(&x_v), - 2 * sizeof(float)); - engine_->Execute(1); + // fill in real data + std::vector x_v = {1.0, 2.0}; + std::vector y_cpu; + PrepareInputOutput(x_v, {2}); + + auto *x_v_gpu_data = input_.mutable_data(ctx_->GetPlace()); + auto *y_gpu_data = output_.mutable_data(ctx_->GetPlace()); + + buffers[0] = reinterpret_cast(x_v_gpu_data); + buffers[1] = reinterpret_cast(y_gpu_data); + + engine_->Execute(1, &buffers, ctx_->stream()); LOG(INFO) << "to get output"; - float y_cpu[2] = {-1., -1.}; + GetOutput(&y_cpu); auto dims = engine_->GetITensor("y")->getDimensions(); ASSERT_EQ(dims.nbDims, 3); ASSERT_EQ(dims.d[0], 2); ASSERT_EQ(dims.d[1], 1); - engine_->GetOutputInCPU("y", &y_cpu[0], 2 * sizeof(float)); + ASSERT_EQ(y_cpu[0], 4.5); ASSERT_EQ(y_cpu[1], 14.5); } @@ -117,12 +151,13 @@ TEST_F(TensorRTEngineTest, test_conv2d) { // Weight in CPU memory. float raw_weight[9] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; float raw_bias[1] = {0}; + std::vector buffers(2); // TRT binded inputs TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 9); TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 1); - auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, + auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, nvinfer1::Dims3{1, 3, 3}); - auto* conv_layer = + auto *conv_layer = TRT_ENGINE_ADD_LAYER(engine_, Convolution, *x, 1, nvinfer1::DimsHW{3, 3}, weight.get(), bias.get()); PADDLE_ENFORCE(conv_layer != nullptr); @@ -133,28 +168,36 @@ TEST_F(TensorRTEngineTest, test_conv2d) { engine_->FreezeNetwork(); ASSERT_EQ(engine_->engine()->getNbBindings(), 2); - float x_v[18] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; - engine_->SetInputFromCPU("x", reinterpret_cast(&x_v), - 18 * sizeof(float)); - engine_->Execute(2); + // fill in real data + std::vector x_v = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + std::vector y_cpu; + PrepareInputOutput(x_v, {18}); + + auto *x_v_gpu_data = input_.mutable_data(ctx_->GetPlace()); + auto *y_gpu_data = output_.mutable_data(ctx_->GetPlace()); + + buffers[0] = reinterpret_cast(x_v_gpu_data); + buffers[1] = reinterpret_cast(y_gpu_data); + + engine_->Execute(2, &buffers, ctx_->stream()); LOG(INFO) << "to get output"; - float* y_cpu = new float[18]; - engine_->GetOutputInCPU("y", &y_cpu[0], 18 * sizeof(float)); + GetOutput(&y_cpu); + ASSERT_EQ(y_cpu[0], 4.0); ASSERT_EQ(y_cpu[1], 6.0); } TEST_F(TensorRTEngineTest, test_pool2d) { // Weight in CPU memory. 
- auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, + auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, nvinfer1::Dims3{1, 2, 2}); + std::vector buffers(2); // TRT binded inputs nvinfer1::PoolingType pool_t = nvinfer1::PoolingType::kAVERAGE; - auto* pool_layer = - TRT_ENGINE_ADD_LAYER(engine_, Pooling, *const_cast(x), - pool_t, nvinfer1::DimsHW{2, 2}); + auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *x, pool_t, + nvinfer1::DimsHW{2, 2}); PADDLE_ENFORCE(pool_layer != nullptr); pool_layer->setStride(nvinfer1::DimsHW{1, 1}); @@ -164,14 +207,21 @@ TEST_F(TensorRTEngineTest, test_pool2d) { engine_->FreezeNetwork(); ASSERT_EQ(engine_->engine()->getNbBindings(), 2); - float x_v[8] = {1.0, 2.0, 5.0, 0.0, 2.0, 3.0, 5.0, 10.0}; - engine_->SetInputFromCPU("x", reinterpret_cast(&x_v), - 8 * sizeof(float)); - engine_->Execute(2); + // fill in real data + std::vector x_v = {1.0, 2.0, 5.0, 0.0, 2.0, 3.0, 5.0, 10.0}; + std::vector y_cpu; + PrepareInputOutput(x_v, {2}); + + auto *x_v_gpu_data = input_.mutable_data(ctx_->GetPlace()); + auto *y_gpu_data = output_.mutable_data(ctx_->GetPlace()); + + buffers[0] = reinterpret_cast(x_v_gpu_data); + buffers[1] = reinterpret_cast(y_gpu_data); + + engine_->Execute(2, &buffers, ctx_->stream()); LOG(INFO) << "to get output"; - float* y_cpu = new float[2]; - engine_->GetOutputInCPU("y", &y_cpu[0], 2 * sizeof(float)); + GetOutput(&y_cpu); ASSERT_EQ(y_cpu[0], 2.0); ASSERT_EQ(y_cpu[1], 5.0); diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 55ab04bfe16ec6a3d97c443f59c72e7b85fb1899..2f17a44e0c08ef7d9204a115512a1cd76790efdf 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -105,17 +105,24 @@ set(SEQ_CONV1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_conv1") download_model_and_data(${SEQ_CONV1_INSTALL_DIR} "seq_conv1_model.tar.gz" "seq_conv1_data.txt.tar.gz") inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} analyzer_seq_conv1_tester.cc) +# transformer, the dataset only works on batch_size=8 now +set(TRANSFORMER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/transformer") +download_model_and_data(${TRANSFORMER_INSTALL_DIR} "temp%2Ftransformer_model.tar.gz" "temp%2Ftransformer_data.txt.tar.gz") +inference_analysis_test(test_analyzer_transformer SRCS analyzer_transformer_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8 SERIAL) + # ocr set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr") if (NOT EXISTS ${OCR_INSTALL_DIR}) - inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz") + inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos%2Focr.tar.gz") endif() inference_analysis_api_test_with_refer_result(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc SERIAL) # mobilenet with transpose op set(MOBILENET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet") if (NOT EXISTS ${MOBILENET_INSTALL_DIR}) - inference_download_and_uncompress(${MOBILENET_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Fmobilenet.tar.gz") + inference_download_and_uncompress(${MOBILENET_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos%2Fmobilenet.tar.gz") endif() 
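Two notes on the test_engine.cc migration above. First, the expected values are unchanged and easy to verify by hand: in add_layer_multi_dim, y = Wx + b with W = [[1.0, 1.1], [3.3, 4.4]], b = [1.3, 2.4] and x = [1.0, 2.0] gives y0 = 1.0 + 2.2 + 1.3 = 4.5 and y1 = 3.3 + 8.8 + 2.4 = 14.5, matching the assertions. Second, the engine no longer owns a CUDA stream or copies host data itself; tests now stage data in framework::Tensor objects on the device, pass the raw device pointers through a bindings vector, and supply the stream explicitly. A condensed sketch of the new calling convention, with illustrative shapes and batch size:

// Stage input on the device, bind raw pointers, run, copy the result back.
std::vector<void *> buffers(2);          // one slot per TRT binding: "x", "y"
TensorFromVector(x_v, *ctx_, &input_);   // host -> device copy
output_.Resize(framework::make_ddim({1}));
buffers[0] =
    reinterpret_cast<void *>(input_.mutable_data<float>(ctx_->GetPlace()));
buffers[1] =
    reinterpret_cast<void *>(output_.mutable_data<float>(ctx_->GetPlace()));
engine_->Execute(1, &buffers, ctx_->stream());  // asynchronous launch
GetOutput(&y_cpu);                       // device -> host via TensorToVector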
inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc SERIAL) diff --git a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc index 3f6c933f2bcc6ed5410cb95a48f5ee6869280fe4..5157bd280d0f3ee327d5cee7799477b5e6fd3f71 100644 --- a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc @@ -107,6 +107,9 @@ void SetConfig(AnalysisConfig *cfg) { cfg->DisableGpu(); cfg->SwitchSpecifyInputNames(); cfg->SwitchIrOptim(); + if (FLAGS_zero_copy) { + cfg->SwitchUseFeedFetchOps(false); + } } void SetInput(std::vector> *inputs) { @@ -131,7 +134,7 @@ TEST(Analyzer_Pyramid_DNN, profile) { TestPrediction(reinterpret_cast(&cfg), input_slots_all, &outputs, FLAGS_num_threads); - if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { + if (FLAGS_num_threads == 1 && !FLAGS_test_all_data && !FLAGS_zero_copy) { PADDLE_ENFORCE_EQ(outputs.size(), 1UL); size_t size = GetSize(outputs[0]); PADDLE_ENFORCE_GT(size, 0); @@ -166,6 +169,19 @@ TEST(Analyzer_Pyramid_DNN, compare) { reinterpret_cast(&cfg), input_slots_all); } +// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy +TEST(Analyzer_Pyramid_DNN, compare_zero_copy) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + std::vector outputs_name; + outputs_name.emplace_back("cos_sim_2.tmp_0"); + CompareAnalysisAndZeroCopy(reinterpret_cast(&cfg), + input_slots_all, outputs_name); +} + // Compare Deterministic result TEST(Analyzer_Pyramid_DNN, compare_determine) { AnalysisConfig cfg; diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc index 36282b3efe5756da55b056c09e94aa352e3dcf8a..dcf4b38ce8a9230148738cfd0840ca96b0c7cf8c 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc @@ -207,6 +207,9 @@ void SetConfig(AnalysisConfig *cfg) { cfg->DisableGpu(); cfg->SwitchSpecifyInputNames(); cfg->SwitchIrOptim(); + if (FLAGS_zero_copy) { + cfg->SwitchUseFeedFetchOps(false); + } } void SetInput(std::vector> *inputs) { @@ -285,133 +288,17 @@ TEST(Analyzer_rnn1, multi_thread) { input_slots_all, &outputs, 2 /* multi_thread */); } -// Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing -// on the complex RNN1 model. -TEST(Analyzer_rnn1, ZeroCopy) { - AnalysisConfig config; - SetConfig(&config); - config.SwitchUseFeedFetchOps(false); - - PaddlePlace place; - - auto predictor = CreatePaddlePredictor(config); - - config.SwitchUseFeedFetchOps(true); - auto native_predictor = - CreatePaddlePredictor(config.ToNativeConfig()); - - config.SwitchUseFeedFetchOps( - true); // the analysis predictor needs feed/fetch. 
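The hand-written RNN1 zero-copy tests removed below are superseded by the shared CompareAnalysisAndZeroCopy helper. For reference, the zero-copy flow they exercised is roughly the sketch below; the tensor names, shapes, and host_values are placeholders, while the calls themselves appear throughout this patch.

// Zero-copy inference sketch: no feed/fetch ops, direct tensor access.
AnalysisConfig config;
SetConfig(&config);
config.SwitchUseFeedFetchOps(false);  // required before ZeroCopyRun()
auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
auto input = predictor->GetInputTensor("data");  // "data" is illustrative
input->Reshape({batch_size, dim});
ZeroCopyTensorAssignData<float>(input.get(), host_values);
predictor->ZeroCopyRun();
auto output = predictor->GetOutputTensor("out");  // "out" is illustrative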
- auto analysis_predictor = CreatePaddlePredictor(config); - -#define NEW_TENSOR(name__) \ - auto name__##_tensor = predictor->GetInputTensor(#name__); - NEW_TENSOR(data_lod_attention); - NEW_TENSOR(cell_init); - NEW_TENSOR(data); - NEW_TENSOR(week); - NEW_TENSOR(minute); - NEW_TENSOR(hidden_init); - - // Prepare data for AnalysisPredictor - DataRecord data(FLAGS_infer_data, FLAGS_batch_size); - PrepareZeroCopyInputs(data_lod_attention_tensor.get(), cell_init_tensor.get(), - data_tensor.get(), hidden_init_tensor.get(), - week_tensor.get(), minute_tensor.get(), &data, - FLAGS_batch_size); - - // Prepare data for NativePredictor - std::vector> native_inputs; - SetInput(&native_inputs); - std::vector native_outputs; - std::vector analysis_outputs; - - auto output_tensor = predictor->GetOutputTensor("final_output.tmp_1"); - // Run analysis predictor - - int num_ops; - auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops); - ASSERT_TRUE(fuse_statis.count("fc_fuse")); - ASSERT_EQ(fuse_statis.at("fc_fuse"), 1); - ASSERT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM - ASSERT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1); - ASSERT_EQ(num_ops, - 13); // After graph optimization, only 13 operators exists. - - Timer timer; - double total_time{0}; - for (int i = 0; i < FLAGS_repeat; i++) { - timer.tic(); - predictor->ZeroCopyRun(); - total_time += timer.toc(); - } - LOG(INFO) << "ZeroCopy output: " << DescribeZeroCopyTensor(*output_tensor); - - ASSERT_TRUE(native_predictor->Run(native_inputs.front(), &native_outputs)); - LOG(INFO) << "native output " << DescribeTensor(native_outputs.front()); - - int output_size{0}; // this is the number of elements not memory size - auto *zero_copy_data = output_tensor->data(&place, &output_size); - auto *native_data = static_cast(native_outputs.front().data.data()); - for (int i = 0; i < output_size; i++) { - EXPECT_NEAR(zero_copy_data[i], native_data[i], 1e-3); - } -} - -TEST(Analyzer_rnn1, ZeroCopyMultiThread) { - AnalysisConfig config; - SetConfig(&config); - config.SwitchUseFeedFetchOps(false); - -#define NEW_TENSOR(name__) \ - auto name__##_tensor = predictor->GetInputTensor(#name__); - - std::vector> predictors; - predictors.emplace_back(CreatePaddlePredictor(config)); - for (int tid = 1; tid < FLAGS_num_threads; tid++) { - predictors.emplace_back(predictors.front()->Clone()); - } - double total_time_of_threads{0}; - std::vector threads; - - for (int tid = 0; tid < FLAGS_num_threads; tid++) { - threads.emplace_back([&, tid] { - auto &predictor = predictors[tid]; - NEW_TENSOR(data_lod_attention); - NEW_TENSOR(cell_init); - NEW_TENSOR(data); - NEW_TENSOR(week); - NEW_TENSOR(minute); - NEW_TENSOR(hidden_init); - - // Prepare data for AnalysisPredictor - DataRecord data(FLAGS_infer_data, FLAGS_batch_size); - Timer timer; - double total_time{0}; - - for (int i = 0; i < FLAGS_repeat; i++) { - PrepareZeroCopyInputs(data_lod_attention_tensor.get(), - cell_init_tensor.get(), data_tensor.get(), - hidden_init_tensor.get(), week_tensor.get(), - minute_tensor.get(), &data, FLAGS_batch_size); - - timer.tic(); - predictor->ZeroCopyRun(); - total_time += timer.toc(); - } - - total_time_of_threads += total_time; - - LOG(INFO) << "thread time: " << total_time / FLAGS_repeat; - }); - } - - for (auto &t : threads) { - t.join(); - } +// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy +TEST(Analyzer_rnn1, compare_zero_copy) { + AnalysisConfig cfg; + SetConfig(&cfg); - LOG(INFO) << "average time: " - << total_time_of_threads / 
FLAGS_num_threads / FLAGS_repeat; + std::vector> input_slots_all; + SetInput(&input_slots_all); + std::vector outputs_name; + outputs_name.emplace_back("final_output.tmp_1"); + CompareAnalysisAndZeroCopy(reinterpret_cast(&cfg), + input_slots_all, outputs_name); } } // namespace inference diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc index cca2ab1ee148b568e714c24dded7cd72403f0e5f..19fa5528da4d11d2eb1a2f932f60a84c3f5468e7 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc @@ -144,6 +144,9 @@ void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false) { cfg->SwitchSpecifyInputNames(); cfg->SwitchIrDebug(); cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); + if (FLAGS_zero_copy) { + cfg->SwitchUseFeedFetchOps(false); + } if (use_mkldnn) { cfg->EnableMKLDNN(); } @@ -184,10 +187,10 @@ TEST(Analyzer_seq_pool1, compare_determine) { input_slots_all); } -void analysis_fuse_statis(bool use_zerocopy) { +// Check the fuse status +TEST(Analyzer_seq_pool1, fuse_statis) { AnalysisConfig cfg; SetConfig(&cfg); - cfg.SwitchUseFeedFetchOps(!use_zerocopy); int num_ops; auto predictor = CreatePaddlePredictor(cfg); auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops); @@ -203,137 +206,17 @@ void analysis_fuse_statis(bool use_zerocopy) { EXPECT_EQ(num_ops, 171); } -// Check the fuse status -TEST(Analyzer_seq_pool1, fuse_statis) { analysis_fuse_statis(false); } - -void PrepareZeroCopyInputs( - const std::unique_ptr &predictor, - std::vector> *inputs) { - DataRecord data(FLAGS_infer_data, FLAGS_batch_size); - // only feed one batch - const auto &one_batch = data.NextBatch(); - inputs->clear(); - for (size_t i = 0; i < one_batch.size(); ++i) { - auto &slot = one_batch[i]; - auto tensor = predictor->GetInputTensor(slot.name + "_embed"); - tensor->Reshape(slot.shape); - tensor->SetLoD({slot.lod}); - ZeroCopyTensorAssignData(tensor.get(), slot.data); - inputs->emplace_back(std::move(tensor)); - } -} - -// return the output values -std::vector zerocopy_profile(int repeat_times) { - AnalysisConfig config; - SetConfig(&config); - config.SwitchUseFeedFetchOps(false); - auto predictor = CreatePaddlePredictor(config); - std::vector> inputs; - PrepareZeroCopyInputs(predictor, &inputs); - auto output_tensor = predictor->GetOutputTensor(out_var_name); - Timer timer; - LOG(INFO) << "Warm up run..."; - timer.tic(); - predictor->ZeroCopyRun(); - PrintTime(FLAGS_batch_size, 1, 1, 0, timer.toc(), 1); - if (FLAGS_profile) { - paddle::platform::ResetProfiler(); - } - LOG(INFO) << "Run " << repeat_times << " times..."; - timer.tic(); - for (int i = 0; i < repeat_times; i++) { - predictor->ZeroCopyRun(); - } - PrintTime(FLAGS_batch_size, repeat_times, 1, 0, timer.toc() / repeat_times, - 1); - - LOG(INFO) << "ZeroCopy output: " << DescribeZeroCopyTensor(*output_tensor); - PaddlePlace place; - int output_size{0}; - auto *pdata = output_tensor->data(&place, &output_size); - std::vector res(output_size); - for (int i = 0; i < output_size; ++i) { - res[i] = pdata[i]; - } - return res; -} - -TEST(Analyzer_seq_pool1, zerocopy_profile) { zerocopy_profile(FLAGS_repeat); } - -TEST(Analyzer_seq_pool1, zerocopy_profile_threads) { - AnalysisConfig config; - SetConfig(&config); - config.SwitchUseFeedFetchOps(false); - - std::vector> predictors; - predictors.emplace_back(CreatePaddlePredictor(config)); - for (int tid = 1; tid < FLAGS_num_threads; 
tid++) { - predictors.emplace_back(predictors.front()->Clone()); - } - double total_time_of_threads{0}; - std::vector threads; - - for (int tid = 0; tid < FLAGS_num_threads; tid++) { - threads.emplace_back([&, tid] { - auto &predictor = predictors[tid]; - std::vector> inputs; - PrepareZeroCopyInputs(predictor, &inputs); - auto output_tensor = predictor->GetOutputTensor(out_var_name); - Timer timer; - double total_time{0}; - - LOG(INFO) << "Warm up run..."; - timer.tic(); - predictor->ZeroCopyRun(); - PrintTime(FLAGS_batch_size, 1, FLAGS_num_threads, tid, timer.toc(), 1); - if (FLAGS_profile) { - paddle::platform::ResetProfiler(); - } - int repeat_times = FLAGS_repeat; - LOG(INFO) << "Run " << repeat_times << " times..."; - timer.tic(); - - for (int i = 0; i < repeat_times; i++) { - predictor->ZeroCopyRun(); - } - total_time += timer.toc(); - total_time_of_threads += total_time; - - LOG(INFO) << "thread time: " << total_time / repeat_times; - }); - } - - for (auto &t : threads) { - t.join(); - } - - LOG(INFO) << "average time: " - << total_time_of_threads / FLAGS_num_threads / FLAGS_repeat; -} - -TEST(Analyzer_seq_pool1, zerocopy_fuse_statis) { analysis_fuse_statis(true); } +// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy +TEST(Analyzer_seq_pool1, compare_zero_copy) { + AnalysisConfig cfg; + SetConfig(&cfg); -TEST(Analyzer_seq_pool1, zerocopy_compare_native) { - AnalysisConfig config; - SetConfig(&config); - config.SwitchUseFeedFetchOps(true); - auto predictor = CreatePaddlePredictor(config.ToNativeConfig()); - std::vector native_outputs; std::vector> input_slots_all; SetInput(&input_slots_all); - ASSERT_TRUE(predictor->Run(input_slots_all[0], &native_outputs)); - EXPECT_EQ(native_outputs.size(), 1UL); - - auto zerocopy_output = zerocopy_profile(1); - EXPECT_EQ(zerocopy_output.size() * sizeof(float), - native_outputs.front().data.length()); - auto *native_data = static_cast(native_outputs.front().data.data()); - for (size_t i = 0; i < zerocopy_output.size(); ++i) { - EXPECT_LT( - std::fabs((zerocopy_output[i] - native_data[i]) / zerocopy_output[i]), - 1e-3); - } + std::vector outputs_name; + outputs_name.emplace_back(out_var_name); + CompareAnalysisAndZeroCopy(reinterpret_cast(&cfg), + input_slots_all, outputs_name); } } // namespace analysis diff --git a/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc b/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..a925da312cde30380b4997b8b76a0d425a71e817 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc @@ -0,0 +1,241 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
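One orientation note before the implementation: the Load() routine below expects one sample per line, with eight comma-separated fields, each field holding space-separated numbers. Schematically (field order taken from the parsing code; the values shown are made up):

// src_word , src_pos , src_slf_attn_bias , trg_word , init_score , init_idx ,
// trg_src_attn_bias , batch_data_shape
// e.g. "3 75 11,0 1 2,-1e9 0 0,0,0.0,0,-1e9 0 0,1 8 1 3"  (illustrative only)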
+
+#include "paddle/fluid/inference/tests/api/tester_helper.h"
+
+namespace paddle {
+namespace inference {
+
+struct DataRecord {
+  std::vector<std::vector<int64_t>> src_word, src_pos, trg_word, init_idx;
+  std::vector<std::vector<float>> src_slf_attn_bias, init_score,
+      trg_src_attn_bias;
+  std::vector<std::vector<int>> batch_data_shape;
+  std::vector<std::vector<size_t>> lod;
+  size_t batch_iter{0}, batch_size{1}, num_samples;  // total number of samples
+  DataRecord() = default;
+  explicit DataRecord(const std::string &path, int batch_size = 1)
+      : batch_size(batch_size) {
+    Load(path);
+  }
+  DataRecord NextBatch() {
+    DataRecord data;
+    size_t batch_end = batch_iter + batch_size;
+    // NOTE: skip the final batch if not enough data is provided.
+    if (batch_end <= src_word.size()) {
+      data.src_word.assign(src_word.begin() + batch_iter,
+                           src_word.begin() + batch_end);
+      data.src_pos.assign(src_pos.begin() + batch_iter,
+                          src_pos.begin() + batch_end);
+      data.src_slf_attn_bias.assign(src_slf_attn_bias.begin() + batch_iter,
+                                    src_slf_attn_bias.begin() + batch_end);
+      data.trg_word.assign(trg_word.begin() + batch_iter,
+                           trg_word.begin() + batch_end);
+      data.init_score.assign(init_score.begin() + batch_iter,
+                             init_score.begin() + batch_end);
+      data.init_idx.assign(init_idx.begin() + batch_iter,
+                           init_idx.begin() + batch_end);
+      data.trg_src_attn_bias.assign(trg_src_attn_bias.begin() + batch_iter,
+                                    trg_src_attn_bias.begin() + batch_end);
+      std::vector<int> batch_shape =
+          *(batch_data_shape.begin() + batch_iter);
+      data.batch_data_shape.push_back(batch_shape);
+      data.lod.resize(2);
+      for (int i = 0; i < batch_shape[0] + 1; i++) {
+        data.lod[0].push_back(i);
+        data.lod[1].push_back(i);
+      }
+    }
+    batch_iter += batch_size;
+    return data;
+  }
+  void Load(const std::string &path) {
+    std::ifstream file(path);
+    std::string line;
+    size_t num_lines = 0;
+    while (std::getline(file, line)) {
+      num_lines++;
+      std::vector<std::string> data;
+      split(line, ',', &data);
+      CHECK_EQ(data.size(), static_cast<size_t>(8));
+      // load src_word
+      std::vector<int64_t> src_word_data;
+      split_to_int64(data[0], ' ', &src_word_data);
+      src_word.push_back(std::move(src_word_data));
+      // load src_pos
+      std::vector<int64_t> src_pos_data;
+      split_to_int64(data[1], ' ', &src_pos_data);
+      src_pos.push_back(std::move(src_pos_data));
+      // load src_slf_attn_bias
+      std::vector<float> src_slf_attn_bias_data;
+      split_to_float(data[2], ' ', &src_slf_attn_bias_data);
+      src_slf_attn_bias.push_back(std::move(src_slf_attn_bias_data));
+      // load trg_word
+      std::vector<int64_t> trg_word_data;
+      split_to_int64(data[3], ' ', &trg_word_data);
+      trg_word.push_back(std::move(trg_word_data));
+      // load init_score
+      std::vector<float> init_score_data;
+      split_to_float(data[4], ' ', &init_score_data);
+      init_score.push_back(std::move(init_score_data));
+      // load init_idx
+      std::vector<int64_t> init_idx_data;
+      split_to_int64(data[5], ' ', &init_idx_data);
+      init_idx.push_back(std::move(init_idx_data));
+      // load trg_src_attn_bias
+      std::vector<float> trg_src_attn_bias_data;
+      split_to_float(data[6], ' ', &trg_src_attn_bias_data);
+      trg_src_attn_bias.push_back(std::move(trg_src_attn_bias_data));
+      // load the batch data shape (it varies from batch to batch)
+      std::vector<int> batch_data_shape_data;
+      split_to_int(data[7], ' ', &batch_data_shape_data);
+      batch_data_shape.push_back(std::move(batch_data_shape_data));
+    }
+    num_samples = num_lines;
+  }
+};
+
+void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
+                   int batch_size) {
+  auto one_batch = data->NextBatch();
+  batch_size = one_batch.batch_data_shape[0][0];
+  auto n_head = one_batch.batch_data_shape[0][1];
+  auto trg_seq_len = one_batch.batch_data_shape[0][2];  // 1 for inference
+  auto src_seq_len = one_batch.batch_data_shape[0][3];
+
+  PaddleTensor src_word, src_pos, src_slf_attn_bias, trg_word, init_score,
+      init_idx, trg_src_attn_bias;
+
+  src_word.name = "src_word";
+  src_word.shape.assign({batch_size, src_seq_len, 1});
+  src_word.dtype = PaddleDType::INT64;
+  TensorAssignData<int64_t>(&src_word, one_batch.src_word);
+
+  src_pos.name = "src_pos";
+  src_pos.shape.assign({batch_size, src_seq_len, 1});
+  src_pos.dtype = PaddleDType::INT64;
+  TensorAssignData<int64_t>(&src_pos, one_batch.src_pos);
+
+  src_slf_attn_bias.name = "src_slf_attn_bias";
+  src_slf_attn_bias.shape.assign(
+      {batch_size, n_head, src_seq_len, src_seq_len});
+  src_slf_attn_bias.dtype = PaddleDType::FLOAT32;
+  TensorAssignData<float>(&src_slf_attn_bias, one_batch.src_slf_attn_bias);
+
+  trg_word.name = "trg_word";
+  trg_word.shape.assign({batch_size, 1});
+  trg_word.dtype = PaddleDType::INT64;
+  trg_word.lod.assign(one_batch.lod.begin(), one_batch.lod.end());
+  TensorAssignData<int64_t>(&trg_word, one_batch.trg_word);
+
+  init_score.name = "init_score";
+  init_score.shape.assign({batch_size, 1});
+  init_score.dtype = PaddleDType::FLOAT32;
+  init_score.lod.assign(one_batch.lod.begin(), one_batch.lod.end());
+  TensorAssignData<float>(&init_score, one_batch.init_score);
+
+  init_idx.name = "init_idx";
+  init_idx.shape.assign({batch_size});
+  init_idx.dtype = PaddleDType::INT32;
+  TensorAssignData<int64_t>(&init_idx, one_batch.init_idx);
+
+  trg_src_attn_bias.name = "trg_src_attn_bias";
+  trg_src_attn_bias.shape.assign(
+      {batch_size, n_head, trg_seq_len, src_seq_len});
+  trg_src_attn_bias.dtype = PaddleDType::FLOAT32;
+  TensorAssignData<float>(&trg_src_attn_bias, one_batch.trg_src_attn_bias);
+
+  input_slots->assign({src_word, src_pos, src_slf_attn_bias, trg_word,
+                       init_score, init_idx, trg_src_attn_bias});
+}
+
+void SetConfig(AnalysisConfig *cfg) {
+  cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params");
+  cfg->DisableGpu();
+  cfg->SwitchSpecifyInputNames();
+  cfg->SwitchIrOptim();
+  cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
+}
+
+void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
+  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
+  std::vector<PaddleTensor> input_slots;
+  int test_batch_num =
+      FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1;
+  LOG(INFO) << "The number of samples to be tested: "
+            << test_batch_num * FLAGS_batch_size;
+  for (int bid = 0; bid < test_batch_num; ++bid) {
+    input_slots.clear();
+    PrepareInputs(&input_slots, &data, FLAGS_batch_size);
+    (*inputs).emplace_back(input_slots);
+  }
+}
+
+// Easy to profile independently.
+void profile(bool use_mkldnn = false) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+  std::vector<PaddleTensor> outputs;
+  if (use_mkldnn) {
+    cfg.EnableMKLDNN();
+  }
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                 input_slots_all, &outputs, FLAGS_num_threads);
+}
+
+TEST(Analyzer_Transformer, profile) { profile(); }
+#ifdef PADDLE_WITH_MKLDNN
+TEST(Analyzer_Transformer, profile_mkldnn) { profile(true); }
+#endif
+
+// Check the fuse status
+TEST(Analyzer_Transformer, fuse_statis) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  int num_ops;
+  auto predictor = CreatePaddlePredictor(cfg);
+  auto fuse_statis = GetFuseStatis(
+      static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
+}
+
+// Compare result of NativeConfig and AnalysisConfig
+// void compare(bool use_mkldnn = false) {
+//   AnalysisConfig cfg;
+//   SetConfig(&cfg);
+//   if (use_mkldnn) {
+//     cfg.EnableMKLDNN();
+//   }
+//
+//   std::vector<std::vector<PaddleTensor>> input_slots_all;
+//   SetInput(&input_slots_all);
+//   CompareNativeAndAnalysis(
+//       reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+//       input_slots_all);
+// }
+
+// TODO(yihuaxu):
+// Disable compare and compare_mkldnn temporarily, see
+// https://github.com/paddlePaddle/Paddle/issues/16316 for details.
+// TEST(Analyzer_Transformer, compare) { compare(); }
+// #ifdef PADDLE_WITH_MKLDNN
+// TEST(Analyzer_Transformer, compare_mkldnn) { compare(true /* use_mkldnn */);
+// }
+// #endif
+
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index 2e53fddfe7f6f0c5b31ff069fb1661f143022841..a4881afe58a03902556ddb8a057c5f0579e4d1d2 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -25,7 +25,6 @@
 #ifdef WITH_GPERFTOOLS
 #include <gperftools/profiler.h>
 #endif
-
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/analysis/analyzer.h"
@@ -51,6 +50,7 @@ DEFINE_bool(use_analysis, true,
 DEFINE_bool(record_benchmark, false,
             "Record benchmark after profiling the model");
 DEFINE_double(accuracy, 1e-3, "Result Accuracy.");
+DEFINE_bool(zero_copy, false, "Use ZeroCopy to speedup Feed/Fetch.");
 DECLARE_bool(profile);
 DECLARE_int32(paddle_num_threads);
 
@@ -68,6 +68,7 @@ void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) {
   LOG(INFO) << analysis_config->ToNativeConfig();
 }
 
+// Compare results between two PaddleTensor vectors
 void CompareResult(const std::vector<PaddleTensor> &outputs,
                    const std::vector<PaddleTensor> &ref_outputs) {
   EXPECT_GT(outputs.size(), 0UL);
@@ -97,6 +98,58 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
       }
       break;
     }
+    case PaddleDType::INT32: {
+      int32_t *pdata = static_cast<int32_t *>(out.data.data());
+      int32_t *pdata_ref = static_cast<int32_t *>(ref_out.data.data());
+      for (size_t j = 0; j < size; ++j) {
+        EXPECT_EQ(pdata_ref[j], pdata[j]);
+      }
+      break;
+    }
+    }
+  }
+}
+
+// Compare results between a PaddleTensor and a ZeroCopyTensor
+void CompareResult(const std::vector<PaddleTensor> &outputs,
+                   const std::vector<ZeroCopyTensor> &ref_outputs) {
+  EXPECT_GT(outputs.size(), 0UL);
+  EXPECT_EQ(outputs.size(), ref_outputs.size());
+  for (size_t i = 0; i < outputs.size(); i++) {
+    auto &out = outputs[i];
+    auto &ref_out = ref_outputs[i];
+    size_t size = VecReduceToInt(out.shape);
+    EXPECT_GT(size, 0UL);
+    int ref_size = 0;  // this is the number of elements, not the memory size
+    PaddlePlace place;
+    switch (out.dtype) {
+      case PaddleDType::INT64: {
+        int64_t *pdata = static_cast<int64_t *>(out.data.data());
+        int64_t *pdata_ref = ref_out.data<int64_t>(&place,
&ref_size); + EXPECT_EQ(size, ref_size); + for (size_t j = 0; j < size; ++j) { + EXPECT_EQ(pdata_ref[j], pdata[j]); + } + break; + } + case PaddleDType::FLOAT32: { + float *pdata = static_cast(out.data.data()); + float *pdata_ref = ref_out.data(&place, &ref_size); + EXPECT_EQ(size, ref_size); + for (size_t j = 0; j < size; ++j) { + CHECK_LE(std::abs(pdata_ref[j] - pdata[j]), FLAGS_accuracy); + } + break; + } + case PaddleDType::INT32: { + int32_t *pdata = static_cast(out.data.data()); + int32_t *pdata_ref = ref_out.data(&place, &ref_size); + EXPECT_EQ(size, ref_size); + for (size_t j = 0; j < size; ++j) { + EXPECT_EQ(pdata_ref[j], pdata[j]); + } + break; + } } } } @@ -198,61 +251,106 @@ void GetInputPerBatch(const std::vector> &in, } } -void TestOneThreadPrediction( - const PaddlePredictor::Config *config, - const std::vector> &inputs, - std::vector *outputs, bool use_analysis = true) { - int batch_size = FLAGS_batch_size; - int num_times = FLAGS_repeat; - auto predictor = CreateTestPredictor(config, use_analysis); +void ConvertPaddleTensorToZeroCopyTensor( + PaddlePredictor *predictor, const std::vector &inputs) { + for (size_t i = 0; i < inputs.size(); i++) { + auto input = inputs[i]; + auto tensor = predictor->GetInputTensor(input.name); + tensor->Reshape(input.shape); + tensor->SetLoD({input.lod}); + if (input.dtype == PaddleDType::INT64) { + ZeroCopyTensorAssignData(tensor.get(), input.data); + } else if (input.dtype == PaddleDType::FLOAT32) { + ZeroCopyTensorAssignData(tensor.get(), input.data); + } else if (input.dtype == PaddleDType::INT32) { + ZeroCopyTensorAssignData(tensor.get(), input.data); + } else { + LOG(ERROR) << "unsupported feed type " << input.dtype; + } + } +} - // warmup run - LOG(INFO) << "Warm up run..."; - { - Timer warmup_timer; - warmup_timer.tic(); +void PredictionWarmUp(PaddlePredictor *predictor, + const std::vector> &inputs, + std::vector *outputs, int num_threads, + int tid) { + int batch_size = FLAGS_batch_size; + LOG(INFO) << "Running thread " << tid << ", warm up run..."; + if (FLAGS_zero_copy) { + ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[0]); + } + Timer warmup_timer; + warmup_timer.tic(); + if (!FLAGS_zero_copy) { predictor->Run(inputs[0], outputs, batch_size); - PrintTime(batch_size, 1, 1, 0, warmup_timer.toc(), 1); - if (FLAGS_profile) { - paddle::platform::ResetProfiler(); - } + } else { + predictor->ZeroCopyRun(); } + PrintTime(batch_size, 1, num_threads, tid, warmup_timer.toc(), 1); + if (FLAGS_profile) { + paddle::platform::ResetProfiler(); + } +} - LOG(INFO) << "Run " << num_times << " times..."; - { - Timer run_timer; - run_timer.tic(); +void PredictionRun(PaddlePredictor *predictor, + const std::vector> &inputs, + std::vector *outputs, int num_threads, + int tid) { + int batch_size = FLAGS_batch_size; + int num_times = FLAGS_repeat; + LOG(INFO) << "Thread " << tid << " run " << num_times << " times..."; + Timer run_timer; + double elapsed_time = 0; #ifdef WITH_GPERFTOOLS - ProfilerStart("paddle_inference.prof"); + ProfilerStart("paddle_inference.prof"); #endif - for (int i = 0; i < num_times; i++) { - for (size_t j = 0; j < inputs.size(); j++) { - predictor->Run(inputs[j], outputs, batch_size); + if (!FLAGS_zero_copy) { + run_timer.tic(); + for (size_t i = 0; i < inputs.size(); i++) { + for (int j = 0; j < num_times; j++) { + predictor->Run(inputs[i], outputs, batch_size); } } + elapsed_time = run_timer.toc(); + } else { + for (size_t i = 0; i < inputs.size(); i++) { + ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[i]); + 
run_timer.tic(); + for (int j = 0; j < num_times; j++) { + predictor->ZeroCopyRun(); + } + elapsed_time += run_timer.toc(); + } + } #ifdef WITH_GPERFTOOLS - ProfilerStop(); + ProfilerStop(); #endif - double latency = run_timer.toc() / (num_times > 1 ? num_times : 1); - PrintTime(batch_size, num_times, 1, 0, latency, inputs.size()); - if (FLAGS_record_benchmark) { - Benchmark benchmark; - benchmark.SetName(FLAGS_model_name); - benchmark.SetBatchSize(batch_size); - benchmark.SetLatency(latency); - benchmark.PersistToFile("benchmark_record.txt"); - } + PrintTime(batch_size, num_times, num_threads, tid, elapsed_time / num_times, + inputs.size()); + if (FLAGS_record_benchmark) { + Benchmark benchmark; + benchmark.SetName(FLAGS_model_name); + benchmark.SetBatchSize(batch_size); + benchmark.SetLatency(elapsed_time / num_times); + benchmark.PersistToFile("benchmark_record.txt"); } } +void TestOneThreadPrediction( + const PaddlePredictor::Config *config, + const std::vector> &inputs, + std::vector *outputs, bool use_analysis = true) { + auto predictor = CreateTestPredictor(config, use_analysis); + PredictionWarmUp(predictor.get(), inputs, outputs, 1, 0); + PredictionRun(predictor.get(), inputs, outputs, 1, 0); +} + void TestMultiThreadPrediction( const PaddlePredictor::Config *config, const std::vector> &inputs, std::vector *outputs, int num_threads, bool use_analysis = true) { - int batch_size = FLAGS_batch_size; - int num_times = FLAGS_repeat; std::vector threads; std::vector> predictors; predictors.emplace_back(CreateTestPredictor(config, use_analysis)); @@ -260,7 +358,6 @@ void TestMultiThreadPrediction( predictors.emplace_back(predictors.front()->Clone()); } - size_t total_time{0}; for (int tid = 0; tid < num_threads; ++tid) { threads.emplace_back([&, tid]() { // Each thread should have local inputs and outputs. 
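With warm-up and timing factored out into PredictionWarmUp and PredictionRun, the single-threaded and multi-threaded drivers now share one code path, and the new --zero_copy flag flips the execution mode without touching individual testers. A hypothetical tester built on these helpers; SetConfig and SetInput stand for the per-model functions each analyzer tester defines, as seen earlier in this patch:

// Sketch of a model tester using the shared helpers above.
TEST(Analyzer_example, profile) {
  AnalysisConfig cfg;
  SetConfig(&cfg);  // may call SwitchUseFeedFetchOps(false) under FLAGS_zero_copy
  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
  std::vector<PaddleTensor> outputs;
  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
                 input_slots_all, &outputs, FLAGS_num_threads);
}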
@@ -273,34 +370,8 @@ void TestMultiThreadPrediction( ->SetMkldnnThreadID(static_cast(tid) + 1); } #endif - - // warmup run - LOG(INFO) << "Running thread " << tid << ", warm up run..."; - { - Timer warmup_timer; - warmup_timer.tic(); - predictor->Run(inputs[0], outputs, batch_size); - PrintTime(batch_size, 1, num_threads, tid, warmup_timer.toc(), 1); - if (FLAGS_profile) { - paddle::platform::ResetProfiler(); - } - } - - LOG(INFO) << "Thread " << tid << " run " << num_times << " times..."; - { - Timer timer; - timer.tic(); - for (int i = 0; i < num_times; i++) { - for (const auto &input : inputs) { - ASSERT_TRUE(predictor->Run(input, &outputs_tid)); - } - } - - auto time = timer.toc(); - total_time += time; - PrintTime(batch_size, num_times, num_threads, tid, time / num_times, - inputs.size()); - } + PredictionWarmUp(predictor.get(), inputs, outputs, num_threads, tid); + PredictionRun(predictor.get(), inputs, outputs, num_threads, tid); }); } for (int i = 0; i < num_threads; ++i) { @@ -360,6 +431,31 @@ void CompareNativeAndAnalysis( CompareResult(analysis_outputs, native_outputs); } +void CompareAnalysisAndZeroCopy( + PaddlePredictor::Config *config, + const std::vector> &inputs, + const std::vector &outputs_name) { + int batch_size = FLAGS_batch_size; + // analysis + std::vector analysis_outputs; + auto predictor = CreateTestPredictor(config, true); + predictor->Run(inputs[0], &analysis_outputs, batch_size); + // analysis + zero_copy + std::vector zerocopy_outputs; + reinterpret_cast(config)->SwitchUseFeedFetchOps(false); + predictor = CreateTestPredictor(config, true); + ConvertPaddleTensorToZeroCopyTensor(predictor.get(), inputs[0]); + predictor->ZeroCopyRun(); + for (size_t i = 0; i < outputs_name.size(); i++) { + ZeroCopyTensor zerocopy_output = + *predictor->GetOutputTensor(outputs_name[i]).get(); + zerocopy_outputs.emplace_back(zerocopy_output); + LOG(INFO) << "ZeroCopy output: " << DescribeZeroCopyTensor(zerocopy_output); + } + // compare + CompareResult(analysis_outputs, zerocopy_outputs); +} + template std::string LoDTensorSummary(const framework::LoDTensor &tensor) { std::stringstream ss; diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc index 17a433c9d98768dbda4ba93bdceb6cc1717adc07..cb668a4174134ba3ce9517955ff740ada568e97b 100644 --- a/paddle/fluid/inference/tests/api/trt_models_tester.cc +++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc @@ -54,7 +54,8 @@ void SetConfig(AnalysisConfig* config, std::string model_dir, if (use_gpu) { config->EnableUseGpu(100, 0); if (use_tensorrt) { - config->EnableTensorRtEngine(1 << 10, batch_size); + config->EnableTensorRtEngine(1 << 10, batch_size, 3, + AnalysisConfig::Precision::kFloat32, false); config->pass_builder()->DeletePass("conv_bn_fuse_pass"); config->pass_builder()->DeletePass("fc_fuse_pass"); config->pass_builder()->TurnOnDebug(); diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake index 6c5fe043ffa3f3dcafe2dbbebd6244467f859abf..df7af71d9b32ba11822e066f574146cfa5c50edd 100644 --- a/paddle/fluid/inference/tests/test.cmake +++ b/paddle/fluid/inference/tests/test.cmake @@ -1,5 +1,5 @@ include(ExternalProject) -set(INFERENCE_URL "http://paddle-inference-dist.cdn.bcebos.com" CACHE STRING "inference download url") +set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com" CACHE STRING "inference download url") set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING "A path setting 
inference demo download directories.") @@ -30,19 +30,20 @@ function(inference_download_and_uncompress INSTALL_DIR URL FILENAME) ${EXTERNAL_PROJECT_NAME} ${EXTERNAL_PROJECT_LOG_ARGS} PREFIX ${INSTALL_DIR} - URL ${URL}/${FILENAME} + DOWNLOAD_COMMAND wget -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} && + ${CMAKE_COMMAND} -E tar xzf ${INSTALL_DIR}/${FILENAME} DOWNLOAD_DIR ${INSTALL_DIR} DOWNLOAD_NO_PROGRESS 1 CONFIGURE_COMMAND "" BUILD_COMMAND "" UPDATE_COMMAND "" - INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory ${UNPACK_DIR} ${INSTALL_DIR} + INSTALL_COMMAND "" ) endfunction() set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec") -if (NOT EXISTS ${WORD2VEC_INSTALL_DIR}) - inference_download_and_uncompress(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz") +if(NOT EXISTS ${WORD2VEC_INSTALL_DIR} AND NOT WIN32) + inference_download_and_uncompress(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz") endif() set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model") diff --git a/paddle/fluid/inference/utils/CMakeLists.txt b/paddle/fluid/inference/utils/CMakeLists.txt index c43eaf7f9849ee4a88ed95bdb8b6966da8760435..2104e4ac7222258ee025bd5acd60b1db251df654 100644 --- a/paddle/fluid/inference/utils/CMakeLists.txt +++ b/paddle/fluid/inference/utils/CMakeLists.txt @@ -1,4 +1,2 @@ cc_library(benchmark SRCS benchmark.cc DEPS enforce) cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark) -cc_binary(visualizer SRCS visualizer.cc DEPS analysis - paddle_pass_builder ir_pass_manager pass graph_viz_pass analysis_passes) diff --git a/paddle/fluid/inference/utils/singleton.h b/paddle/fluid/inference/utils/singleton.h index cfb89e704457a11a3cd6e89dba5efad5acae0bce..990bef359499834c3a7cb025c3fb1d94ceea958e 100644 --- a/paddle/fluid/inference/utils/singleton.h +++ b/paddle/fluid/inference/utils/singleton.h @@ -45,13 +45,13 @@ struct Registry { } template - static void Register(const std::string& name) { + void Register(const std::string& name) { PADDLE_ENFORCE_EQ(items_.count(name), 0); items_[name] = new ItemChild; } - static ItemParent* Lookup(const std::string& name, - const std::string& default_name = "") { + ItemParent* Lookup(const std::string& name, + const std::string& default_name = "") { auto it = items_.find(name); if (it == items_.end()) { if (default_name == "") @@ -70,11 +70,8 @@ struct Registry { private: Registry() = default; - static std::unordered_map items_; + std::unordered_map items_; }; -template -std::unordered_map Registry::items_; - } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/utils/visualizer.cc b/paddle/fluid/inference/utils/visualizer.cc deleted file mode 100644 index 7c0dd64dea88e51b24c4bc04818d633ee0d2f722..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/utils/visualizer.cc +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
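A side note on the singleton.h change above: Registry's item map moves from class-static storage to instance storage, so registration state now lives in whichever instance owns it, in practice the process-wide instance returned by Singleton<...>::Global(). Call sites change accordingly; a sketch with placeholder types (ItemParent/ItemChild are illustrative):

// Before (class-static state): Registry<ItemParent>::Register<ItemChild>("name");
// After, state belongs to the singleton-owned instance:
Singleton<Registry<ItemParent>>::Global().Register<ItemChild>("name");
ItemParent* item = Singleton<Registry<ItemParent>>::Global().Lookup("name");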
- -#include "paddle/fluid/inference/utils/visualizer.h" -#include -#include -#include -#include -#include "paddle/fluid/framework/ir/graph_viz_pass.h" -#include "paddle/fluid/inference/analysis/analyzer.h" -#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h" -#include "paddle/fluid/platform/init.h" - -DEFINE_string(model_dir, "", "model directory"); -DEFINE_string(model_program_path, "", "model program path"); -DEFINE_string(model_params_path, "", "model params path"); - -using paddle::inference::analysis::Argument; - -namespace paddle { -namespace inference { -namespace utils { - -void Visualizer::SetArgument(Argument *argument) { argument_ = argument; } - -bool Visualizer::Run() { - paddle::framework::InitDevices(false); - paddle::inference::analysis::Analyzer().Run(argument_); - return true; -} - -} // namespace utils -} // namespace inference -} // namespace paddle - -// Generate a dot file describing the structure of graph. -// To use this tool, run command: ./visualizer [options...] -// Options: -// --model_dir: the directory of model -// --model_program_path: the path of program -// --model_params_path: the path of params -int main(int argc, char *argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, true); - google::InitGoogleLogging(argv[0]); - - paddle::inference::analysis::Argument argument; - argument.SetUseGPU(false); - argument.SetUseTensorRT(false); - - if (FLAGS_model_dir.empty()) { - if (FLAGS_model_program_path.empty() || FLAGS_model_params_path.empty()) { - LOG(ERROR) << "Please set model_dir" - " or model_program_path and model_params_path"; - return -1; - } else { - argument.SetModelProgramPath(FLAGS_model_program_path); - argument.SetModelParamsPath(FLAGS_model_params_path); - } - } else { - argument.SetModelDir(FLAGS_model_dir); - } - - // Only 1 pass, default filename is 0_ir_origin.dot - // For more details, looking for paddle::inference::analysis::IRPassManager - argument.SetIrAnalysisPasses({"infer_clean_graph_pass", "graph_viz_pass"}); - - std::unique_ptr scope{ - new paddle::framework::Scope()}; - argument.SetScopeNotOwned( - const_cast(scope.get())); - - paddle::inference::utils::Visualizer visualizer; - visualizer.SetArgument(&argument); - visualizer.Run(); - - return 0; -} - -USE_PASS(infer_clean_graph_pass); -USE_PASS(graph_viz_pass); -USE_PASS(graph_to_program_pass); diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt index e7268077643c3988c59a52bf54873f1e8db4619b..7eb663ea280e65f3c10304aa47c9970df099b901 100644 --- a/paddle/fluid/memory/CMakeLists.txt +++ b/paddle/fluid/memory/CMakeLists.txt @@ -1,6 +1,6 @@ add_subdirectory(detail) add_subdirectory(allocation) -cc_library(malloc SRCS malloc.cc DEPS place enforce allocator_facade) +cc_library(malloc SRCS malloc.cc DEPS place enforce allocator_facade profiler) cc_library(memcpy SRCS memcpy.cc DEPS place) cc_library(memory diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 4b7b9064dcde9b5209264257d51bbd976ba8eb85..0f6014ae8aa28f090cb51401ee2cb0772bca7a45 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -3,7 +3,8 @@ cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator) cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator) cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator) cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator) -cc_library(legacy_allocator SRCS 
legacy_allocator.cc DEPS allocator buddy_allocator) +cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator profiler) +cc_library(zero_size_allocator SRCS zero_size_allocator.cc DEPS allocator) cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator) if (WITH_GPU) @@ -37,28 +38,20 @@ else () set(AllocatorFacadeDeps) endif() +list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator best_fit_allocator aligned_allocator auto_increment_allocator conditional_allocator retry_allocator buffered_allocator legacy_allocator zero_size_allocator) + cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator) cc_library(auto_increment_allocator SRCS auto_increment_allocator.cc DEPS allocator) -cc_library(zero_size_allocator SRCS zero_size_allocator.cc DEPS allocator) cc_library(conditional_allocator SRCS conditional_allocator.cc DEPS allocator) -cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags) -cc_library(allocator_facade SRCS allocator_facade.cc DEPS - ${AllocatorFacadeDeps} - cpu_allocator - locked_allocator - best_fit_allocator - aligned_allocator - auto_increment_allocator - zero_size_allocator - conditional_allocator - retry_allocator - buffered_allocator - allocator_strategy - legacy_allocator - ) +cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps}) +cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy) nv_test(allocation_and_eigen_test SRCS allocation_and_eigen_test.cu DEPS allocator_facade) cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator best_fit_allocator locked_allocator cpu_allocator) -cc_test(allocator_facade_test SRCS allocator_facade_test.cc DEPS allocator_facade) +cc_test(naive_best_fit_allocator_facade_test SRCS naive_best_fit_allocator_facade_test.cc DEPS allocator_facade) + +cc_test(allocator_facade_abs_flags_test SRCS allocator_facade_abs_flags_test.cc DEPS allocator_facade) + +cc_test(allocator_facade_frac_flags_test SRCS allocator_facade_frac_flags_test.cc DEPS allocator_facade) diff --git a/paddle/fluid/memory/allocation/aligned_allocator.h b/paddle/fluid/memory/allocation/aligned_allocator.h index fc1a8e9247b16374037bfde44449fd552b44c6b4..b536d4276e3b6236d0748eee588d345dd15c6954 100644 --- a/paddle/fluid/memory/allocation/aligned_allocator.h +++ b/paddle/fluid/memory/allocation/aligned_allocator.h @@ -14,6 +14,7 @@ #pragma once #include +#include #include "paddle/fluid/memory/allocation/allocator.h" namespace paddle { @@ -93,6 +94,8 @@ class AlignedAllocator : public ThinAlignedAllocator { underlying_allocator_->Allocate(size + kAlignment, attr); return new AlignedAllocation(std::move(raw_allocation), size); } + + void FreeImpl(Allocation* allocation) override { delete allocation; } }; } // namespace allocation diff --git a/paddle/fluid/memory/allocation/allocator.cc b/paddle/fluid/memory/allocation/allocator.cc index 8fb8a5fb897a736d7515951ba08c633da9a7706c..5a5253d911abc722c026730e7e88eb326bb82afd 100644 --- a/paddle/fluid/memory/allocation/allocator.cc +++ b/paddle/fluid/memory/allocation/allocator.cc @@ -27,16 +27,24 @@ bool Allocator::IsAllocThreadSafe() const { return false; } AllocationPtr Allocator::Allocate(size_t size, Allocator::Attr attr) { auto ptr = AllocateImpl(size, attr); - ptr->set_allocator(this); + ptr->RegisterDecoratedAllocator(this); return AllocationPtr(ptr); } -void Allocator::Free(Allocation* allocation) { delete 
allocation; } +void Allocator::FreeImpl(Allocation* allocation) { + Allocator* allocator = allocation->TopDecoratedAllocator(); + allocator->Free(allocation); +} + +void Allocator::Free(Allocation* allocation) { + allocation->PopDecoratedAllocator(); + FreeImpl(allocation); +} const char* BadAlloc::what() const noexcept { return msg_.c_str(); } void AllocationDeleter::operator()(Allocation* allocation) const { - auto* allocator = allocation->allocator(); + Allocator* allocator = allocation->TopDecoratedAllocator(); allocator->Free(allocation); } diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h index f2b6f438c382275cab4ecf9aceea1c55e5885dee..33b816b90812d7fedc450a67743b5d7d20579302 100644 --- a/paddle/fluid/memory/allocation/allocator.h +++ b/paddle/fluid/memory/allocation/allocator.h @@ -15,6 +15,8 @@ #pragma once #include #include +#include +#include #include "paddle/fluid/platform/place.h" namespace paddle { @@ -44,13 +46,56 @@ class Allocator; // NOTE: this is the base class of Allocation. Each allocator can use its own // allocation object. // NOTE: the `Allocation::ptr()` could be nullptr, if the allocation size is 0 + +/** + * Allocation is returned by Allocator::Allocate() method. + * + * An allocator may be decorated by another allocator. For example, we can + * decorate + * a RetryAllocator to any allocator to perform allocation retry when first + * allocation request fails. + * + * Explanations of Allocator design is as follows: + * + * Suppose we have an allocator which is decorated by several allocators: + * + * A(1) <- A(2) <- A(3) <- ... <- A(n) + * + * , and the public allocator is A(1). + * + * The allocation process would be: + * + * A(n).Allocate() -> ... -> A(2).Allocate() -> A(1).Allocate() + * + * , and the free process would be: + * + * A(1).Free() -> A(2).Free() -> ... -> A(n).Free() + * + * Therefore, we should record the allocator chain when allocating, so + * that we can free the allocation in the reverse order of allocator chain. + * The field `decorated_allocators_` is used to record this chain. + * + * Another example is that we want to add additional fields in Allocation, + * e.g., something what is done in AlignedAllocator, etc. + * In this case, we should declare a derived class of Allocation, which + * contains an underlying Allocation allocated by the underlying allocator. + * Therefore, `decorated_allocators_` of the new Allocation object would + * be a new chain, differing from the underlying Allocation object. + */ class Allocation { public: Allocation(void* ptr, size_t size, platform::Place place) - : allocator_(nullptr), ptr_(ptr), size_(size), place_(place) {} + : ptr_(ptr), size_(size), place_(place) { + // NOTE(zjl): Since decorated_allocators_ is usually a small vector + // We reserve a small buffer to it to prevent frequent heap allocation + // Not quite sure whether we need something like gtl vector. + decorated_allocators_.reserve(8); + } Allocation(const Allocation& o) = delete; Allocation& operator=(const Allocation& o) = delete; + Allocation(Allocation&& o) = delete; + Allocation& operator=(Allocation&& o) = delete; // Returns the holding pointer. 
// NOTE: For performance consideration, it is better not to make this method @@ -72,17 +117,31 @@ class Allocation { const platform::Place& place() const { return place_; } - Allocator* allocator() { return allocator_; } + virtual ~Allocation(); - void set_allocator(Allocator* allocator) { allocator_ = allocator; } + private: + const std::vector& DecoratedAllocators() const { + return decorated_allocators_; + } - virtual ~Allocation(); + inline void RegisterDecoratedAllocator(Allocator* allocator) { + decorated_allocators_.push_back(allocator); + } + + inline void PopDecoratedAllocator() { decorated_allocators_.pop_back(); } + + inline Allocator* TopDecoratedAllocator() { + return decorated_allocators_.back(); + } private: - Allocator* allocator_; void* ptr_; size_t size_; platform::Place place_; + std::vector decorated_allocators_; + + friend class Allocator; + friend class AllocationDeleter; }; using AllocationPtr = std::unique_ptr; @@ -132,9 +191,12 @@ class Allocator { // True if the `Allocate` is thread safe. virtual bool IsAllocThreadSafe() const; + // This function should not be called outside + void Free(Allocation* allocation); + protected: - virtual void Free(Allocation* allocation); virtual Allocation* AllocateImpl(size_t size, Allocator::Attr attr) = 0; + virtual void FreeImpl(Allocation* allocation); private: friend class AllocationDeleter; diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index ea0b729dc6f62f517877e060cb0ecbe5c1d22e61..09328aded58cb0cccd9de0aba399f5c49313042f 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -17,6 +17,7 @@ #include #include #include +#include #include #include "paddle/fluid/memory/allocation/aligned_allocator.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" @@ -30,6 +31,7 @@ #include "paddle/fluid/memory/allocation/retry_allocator.h" #include "paddle/fluid/memory/allocation/zero_size_allocator.h" #include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/memory/allocation/cuda_allocator.h" @@ -47,6 +49,17 @@ namespace paddle { namespace memory { namespace allocation { +static inline std::shared_ptr WrapRetryAllocator( + std::shared_ptr allocator, int64_t retry_time) { + if (retry_time > 0) { + auto* retry_allocator = + new RetryAllocator(std::move(allocator), retry_time); + allocator.reset(retry_allocator); + } + + return allocator; +} + // TODO(yy): Dirty code here. This class should be configurable in runtime. 
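The design comment in allocator.h above describes decoration abstractly; concretely, each Allocate() records the decorators an allocation passes through, and AllocationDeleter later frees through TopDecoratedAllocator(), unwinding the chain in reverse order of registration. A sketch assembled from the classes in this patch (`chunk` stands for an Allocation* holding one raw chunk, and the retry timeout is illustrative; compare CreateAllocatorWithChunk just below):

// Innermost to outermost: best-fit -> locked -> retry.
std::shared_ptr<Allocator> best_fit(new BestFitAllocator(chunk));
std::shared_ptr<Allocator> locked(new LockedAllocator(std::move(best_fit)));
std::shared_ptr<Allocator> alloc = WrapRetryAllocator(locked, /*retry_time=*/500);

AllocationPtr p = alloc->Allocate(1024, Allocator::kDefault);
// Destroying p triggers AllocationDeleter, which calls Free() on the top
// decorator and pops the chain: retry, then locked, then best-fit.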
class CPUManagedAllocator : public Allocator { public: @@ -110,14 +123,10 @@ class ChunkedAllocator : public Allocator { std::shared_ptr CreateAllocatorWithChunk() { chunks_.emplace_back(raw_allocator_->Allocate(max_chunk_size_)); auto* allocation = chunks_.back().get(); - std::unique_ptr allocator(new LockedAllocator( - std::unique_ptr(new BestFitAllocator(allocation)))); + std::shared_ptr allocator(new LockedAllocator( + std::shared_ptr(new BestFitAllocator(allocation)))); - if (retry_time_ > 0) { - auto* retry_allocator = - new RetryAllocator(std::move(allocator), retry_time_); - allocator.reset(retry_allocator); - } + allocator = WrapRetryAllocator(allocator, retry_time_); return std::make_shared>(std::move(allocator)); } @@ -188,13 +197,23 @@ class AllocatorFacadePrivate { ~AllocatorFacadePrivate() = default; AllocatorFacadePrivate() { - if (GetAllocatorStrategy() == AllocatorStrategy::kLegacy) { - InitLegacyAllocator(); - } else { - InitCPUAllocator(); - InitCUDAAllocator(); - InitCUDAPinnedAllocator(); - WrapZeroSizeAllocator(); + auto strategy = GetAllocatorStrategy(); + switch (strategy) { + case AllocatorStrategy::kLegacy: { + InitLegacyAllocator(); + break; + } + case AllocatorStrategy::kNaiveBestFit: { + InitCPUAllocator(); + InitCUDAAllocator(); + InitCUDAPinnedAllocator(); + WrapZeroSizeAllocator(); + break; + } + default: { + PADDLE_THROW("Unsupported allocator strategy: %d", + static_cast(strategy)); + } } } @@ -252,8 +271,7 @@ AllocatorFacade& AllocatorFacade::Instance() { std::shared_ptr AllocatorFacade::AllocShared( const platform::Place& place, size_t size, Allocator::Attr attr) { - return std::shared_ptr(Alloc(place, size, attr).release(), - AllocationDeleter()); + return std::shared_ptr(Alloc(place, size, attr)); } AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size, diff --git a/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc b/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..67905973ff620a7e0fb863fef80778aceba7aeb2 --- /dev/null +++ b/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/allocator_facade.h" +#include +#include + +#ifdef PADDLE_WITH_CUDA +DECLARE_double(fraction_of_gpu_memory_to_use); +DECLARE_double(fraction_of_cuda_pinned_memory_to_use); +DECLARE_uint64(initial_gpu_memory_in_mb); +DECLARE_uint64(reallocate_gpu_memory_in_mb); +DECLARE_int64(gpu_allocator_retry_time); +#endif + +namespace paddle { +namespace memory { +namespace allocation { + +//! 
Run allocate test cases for different places +void AllocateTestCases() { + auto &instance = AllocatorFacade::Instance(); + platform::Place place; + size_t size = 1024; + + { + place = platform::CPUPlace(); + size = 1024; + auto cpu_allocation = instance.Alloc(place, size); + ASSERT_NE(cpu_allocation, nullptr); + ASSERT_NE(cpu_allocation->ptr(), nullptr); + ASSERT_EQ(cpu_allocation->place(), place); + ASSERT_EQ(cpu_allocation->size(), size); + } + +#ifdef PADDLE_WITH_CUDA + { + place = platform::CUDAPlace(0); + size = 1024; + auto gpu_allocation = instance.Alloc(place, size); + ASSERT_NE(gpu_allocation, nullptr); + ASSERT_NE(gpu_allocation->ptr(), nullptr); + ASSERT_EQ(gpu_allocation->place(), place); + ASSERT_GE(gpu_allocation->size(), size); + } + + { + // Allocate 2GB gpu memory + place = platform::CUDAPlace(0); + size = 2 * static_cast(1 << 30); + auto gpu_allocation = instance.Alloc(place, size); + ASSERT_NE(gpu_allocation, nullptr); + ASSERT_NE(gpu_allocation->ptr(), nullptr); + ASSERT_EQ(gpu_allocation->place(), place); + ASSERT_GE(gpu_allocation->size(), size); + } + + { + place = platform::CUDAPinnedPlace(); + size = (1 << 20); + auto cuda_pinned_allocation = + instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20); + ASSERT_NE(cuda_pinned_allocation, nullptr); + ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr); + ASSERT_EQ(cuda_pinned_allocation->place(), place); + ASSERT_GE(cuda_pinned_allocation->size(), size); + } +#endif +} + +TEST(Allocator, SpecifyGpuMemory) { +#ifdef PADDLE_WITH_CUDA + // Set to 0.0 to test FLAGS_initial_gpu_memory_in_mb and + // FLAGS_reallocate_gpu_memory_in_mb + FLAGS_fraction_of_gpu_memory_to_use = 0.0; + // 512 MB + FLAGS_initial_gpu_memory_in_mb = 512; + // 4 MB + FLAGS_reallocate_gpu_memory_in_mb = 4; + FLAGS_gpu_allocator_retry_time = 500; + FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5; +#endif + + AllocateTestCases(); +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc b/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..decdc62f1361a9c159b8ccb09910e0f164b35210 --- /dev/null +++ b/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc @@ -0,0 +1,94 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/allocator_facade.h" +#include +#include + +#ifdef PADDLE_WITH_CUDA +DECLARE_double(fraction_of_gpu_memory_to_use); +DECLARE_double(fraction_of_cuda_pinned_memory_to_use); +DECLARE_uint64(initial_gpu_memory_in_mb); +DECLARE_uint64(reallocate_gpu_memory_in_mb); +DECLARE_int64(gpu_allocator_retry_time); +#endif + +namespace paddle { +namespace memory { +namespace allocation { + +//! 
Run allocate test cases for different places +void AllocateTestCases() { + auto &instance = AllocatorFacade::Instance(); + platform::Place place; + size_t size = 1024; + + { + place = platform::CPUPlace(); + size = 1024; + auto cpu_allocation = instance.Alloc(place, size); + ASSERT_NE(cpu_allocation, nullptr); + ASSERT_NE(cpu_allocation->ptr(), nullptr); + ASSERT_EQ(cpu_allocation->place(), place); + ASSERT_EQ(cpu_allocation->size(), size); + } + +#ifdef PADDLE_WITH_CUDA + { + place = platform::CUDAPlace(0); + size = 1024; + auto gpu_allocation = instance.Alloc(place, size); + ASSERT_NE(gpu_allocation, nullptr); + ASSERT_NE(gpu_allocation->ptr(), nullptr); + ASSERT_EQ(gpu_allocation->place(), place); + ASSERT_GE(gpu_allocation->size(), size); + } + + { + // Allocate 2GB gpu memory + place = platform::CUDAPlace(0); + size = 2 * static_cast<size_t>(1 << 30); + auto gpu_allocation = instance.Alloc(place, size); + ASSERT_NE(gpu_allocation, nullptr); + ASSERT_NE(gpu_allocation->ptr(), nullptr); + ASSERT_EQ(gpu_allocation->place(), place); + ASSERT_GE(gpu_allocation->size(), size); + } + + { + place = platform::CUDAPinnedPlace(); + size = (1 << 20); + auto cuda_pinned_allocation = + instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20); + ASSERT_NE(cuda_pinned_allocation, nullptr); + ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr); + ASSERT_EQ(cuda_pinned_allocation->place(), place); + ASSERT_GE(cuda_pinned_allocation->size(), size); + } +#endif +} + +TEST(Allocator, Allocator) { +#ifdef PADDLE_WITH_CUDA + FLAGS_fraction_of_gpu_memory_to_use = 0.01; + FLAGS_gpu_allocator_retry_time = 500; + FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5; +#endif + + AllocateTestCases(); +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/allocator_strategy.cc b/paddle/fluid/memory/allocation/allocator_strategy.cc index b46b1e9ae206b82f5810b4ba7345ebc60fb84285..fff94c01e709613603eea7150a08df3c2611dec2 100644 --- a/paddle/fluid/memory/allocation/allocator_strategy.cc +++ b/paddle/fluid/memory/allocation/allocator_strategy.cc @@ -14,20 +14,27 @@ #include "paddle/fluid/memory/allocation/allocator_strategy.h" #include "gflags/gflags.h" +#include "paddle/fluid/platform/enforce.h" DEFINE_string( allocator_strategy, "legacy", "The allocation strategy. Legacy means the original allocator of Fluid." - "New means the experimental allocators of Fluid. in [legacy, new]"); + "naive_best_fit means the experimental best fit allocator. " + "Enum in [legacy, naive_best_fit]."); namespace paddle { namespace memory { namespace allocation { static AllocatorStrategy GetStrategyFromFlag() { - return FLAGS_allocator_strategy == "legacy" - ? 
AllocatorStrategy::kLegacy - : AllocatorStrategy::kNaiveBestFit; + if (FLAGS_allocator_strategy == "legacy") { + return AllocatorStrategy::kLegacy; + } else if (FLAGS_allocator_strategy == "naive_best_fit") { + return AllocatorStrategy::kNaiveBestFit; + } else { + PADDLE_THROW("Unsupported allocator strategy: %s", + FLAGS_allocator_strategy); + } } AllocatorStrategy GetAllocatorStrategy() { diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.cc b/paddle/fluid/memory/allocation/best_fit_allocator.cc index e3d6c2f511ef083ef9ecc1fe8df96051b2b85cc2..d87dd9a4b6df288065389a335a9ddb4047dd096a 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/best_fit_allocator.cc @@ -109,7 +109,7 @@ size_t BestFitAllocator::NumFreeChunks() const { } return num; } -void BestFitAllocator::Free(Allocation* allocation) { +void BestFitAllocator::FreeImpl(Allocation* allocation) { auto* bf_allocation = dynamic_cast(allocation); PADDLE_ENFORCE_NOT_NULL(bf_allocation, "The input allocation is not BestFitAllocation."); diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.h b/paddle/fluid/memory/allocation/best_fit_allocator.h index 4f10f2b53e8543d4197097f1cae8de765bceeb0f..c137438c0c35a575d366a1dfdf950262f711defa 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/best_fit_allocator.h @@ -119,7 +119,7 @@ class BestFitAllocator : public Allocator { void InsertFreeNode(const ListIt& it); protected: - void Free(Allocation* allocation) override; + void FreeImpl(Allocation* allocation) override; Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override; private: diff --git a/paddle/fluid/memory/allocation/buffered_allocator.cc b/paddle/fluid/memory/allocation/buffered_allocator.cc index fc75abc9dfee6c9df5bc87faa493002cc1fe6298..e04c0aa34b1cd6200806cc2a012161e3478eca0b 100644 --- a/paddle/fluid/memory/allocation/buffered_allocator.cc +++ b/paddle/fluid/memory/allocation/buffered_allocator.cc @@ -22,11 +22,11 @@ namespace paddle { namespace memory { namespace allocation { -BufferedAllocator::BufferedAllocator(std::unique_ptr &&allocator) +BufferedAllocator::BufferedAllocator(std::shared_ptr allocator) : underlying_allocator_(std::move(allocator)) { PADDLE_ENFORCE_NOT_NULL( underlying_allocator_, - "Underlying allocator of BufferedAllocator must be unmanaged"); + "Underlying allocator of BufferedAllocator must not be null"); if (underlying_allocator_->IsAllocThreadSafe()) { mtx_.reset(new std::mutex()); } @@ -41,19 +41,19 @@ void BufferedAllocator::FreeCache(size_t size) { while (!allocations_.empty()) { // free the largest auto it = --allocations_.end(); cur += it->second->size(); - delete it->second.release(); + underlying_allocator_->Free(it->second.release()); allocations_.erase(it); if (cur >= size) return; } } -bool BufferedAllocator::IsAllocThreadSafe() const { - return this->underlying_allocator_->IsAllocThreadSafe(); -} -void BufferedAllocator::Free(Allocation *allocation) { +bool BufferedAllocator::IsAllocThreadSafe() const { return mtx_ != nullptr; } + +void BufferedAllocator::FreeImpl(Allocation *allocation) { platform::LockGuardPtr guard(mtx_); allocations_.emplace(allocation->size(), AllocationPtr(allocation)); } + Allocation *BufferedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { { platform::LockGuardPtr guard(mtx_); @@ -61,17 +61,15 @@ Allocation *BufferedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { if (it != allocations_.end() && it->first < size * 2) 
{ AllocationPtr result(std::move(it->second)); allocations_.erase(it); - return new AllocationWithUnderlying(std::move(result)); + return result.release(); } } try { - return new AllocationWithUnderlying( - underlying_allocator_->Allocate(size, attr)); + return underlying_allocator_->Allocate(size, attr).release(); } catch (BadAlloc &) { FreeCache(size); - return new AllocationWithUnderlying( - underlying_allocator_->Allocate(size, attr)); + return underlying_allocator_->Allocate(size, attr).release(); } } diff --git a/paddle/fluid/memory/allocation/buffered_allocator.h b/paddle/fluid/memory/allocation/buffered_allocator.h index d44a3f85beba712b1e735ba14008689bce7d0d64..c728395705842d29a7b2a8441a7048a7e4bf5e6b 100644 --- a/paddle/fluid/memory/allocation/buffered_allocator.h +++ b/paddle/fluid/memory/allocation/buffered_allocator.h @@ -31,7 +31,7 @@ namespace allocation { // underlying_allocator_ class BufferedAllocator : public Allocator { public: - explicit BufferedAllocator(std::unique_ptr &&allocator); + explicit BufferedAllocator(std::shared_ptr allocator); ~BufferedAllocator(); @@ -44,11 +44,11 @@ class BufferedAllocator : public Allocator { void FreeCache(size_t size); protected: - void Free(Allocation *allocation) override; + void FreeImpl(Allocation *allocation) override; Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override; private: - std::unique_ptr underlying_allocator_; + std::shared_ptr underlying_allocator_; std::multimap allocations_; std::unique_ptr mtx_; }; diff --git a/paddle/fluid/memory/allocation/buffered_allocator_test.cc b/paddle/fluid/memory/allocation/buffered_allocator_test.cc index 41ebb9dbeaf36eafe3dff4ae294b84427f660cbf..854a117b0e7532962d5e0c95fd947527ac3b307a 100644 --- a/paddle/fluid/memory/allocation/buffered_allocator_test.cc +++ b/paddle/fluid/memory/allocation/buffered_allocator_test.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/memory/allocation/buffered_allocator.h" #include +#include #include "paddle/fluid/memory/allocation/best_fit_allocator.h" #include "paddle/fluid/memory/allocation/cpu_allocator.h" #include "paddle/fluid/memory/allocation/locked_allocator.h" @@ -64,7 +65,7 @@ class StubAllocator : public Allocator { size_t GetFreeCount() const { return destruct_count_; } protected: - void Free(Allocation *allocation) override { + void FreeImpl(Allocation *allocation) override { auto *alloc = dynamic_cast(allocation); PADDLE_ENFORCE_NOT_NULL(alloc); if (alloc->ptr()) delete[] static_cast(alloc->ptr()); diff --git a/paddle/fluid/memory/allocation/cpu_allocator.cc b/paddle/fluid/memory/allocation/cpu_allocator.cc index cc81a6f7b8b1950b07b6fb1571b53d9b5ddb1b9f..90c49c87a677aa38bce35774b3a7bb698e6f43e7 100644 --- a/paddle/fluid/memory/allocation/cpu_allocator.cc +++ b/paddle/fluid/memory/allocation/cpu_allocator.cc @@ -20,25 +20,27 @@ namespace paddle { namespace memory { namespace allocation { -CPUAllocation::CPUAllocation(void *ptr, size_t size) - : Allocation(ptr, size, platform::CPUPlace()) {} - bool CPUAllocator::IsAllocThreadSafe() const { return true; } -void CPUAllocator::Free(Allocation *allocation) { - PADDLE_ENFORCE_NOT_NULL(dynamic_cast(allocation)); - free(allocation->ptr()); +void CPUAllocator::FreeImpl(Allocation *allocation) { + void *p = allocation->ptr(); +#ifdef _WIN32 + _aligned_free(p); +#else + free(p); +#endif delete allocation; } Allocation *CPUAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { - void *ptr; - auto status = posix_memalign(&ptr, kAlignment, size); - if (UNLIKELY(status) != 0) { - throw 
BadAlloc(string::Sprintf("Cannot allocate cpu memory %d. Errno is %d", - size, status)); - } - return new CPUAllocation(ptr, size); + void *p; +#ifdef _WIN32 + p = _aligned_malloc(size, kAlignment); +#else + PADDLE_ENFORCE_EQ(posix_memalign(&p, kAlignment, size), 0, "Alloc %ld error!", + size); +#endif + return new Allocation(p, size, platform::CPUPlace()); } } // namespace allocation } // namespace memory diff --git a/paddle/fluid/memory/allocation/cpu_allocator.h b/paddle/fluid/memory/allocation/cpu_allocator.h index 26d3643f4edff1f2d71b1c761e915a6dacb485ad..3eb1416b0efa9327f2052e1f128359bc93f94986 100644 --- a/paddle/fluid/memory/allocation/cpu_allocator.h +++ b/paddle/fluid/memory/allocation/cpu_allocator.h @@ -31,19 +31,13 @@ namespace allocation { // // NOTE(yy): It is no need to use `BestFitAllocator` in CPU. We can import // an open-sourced allocator into Paddle. -class CPUAllocator; -class CPUAllocation : public Allocation { - public: - CPUAllocation(void* ptr, size_t size); -}; - class CPUAllocator : public Allocator { public: - constexpr static size_t kAlignment = 64u; + constexpr static size_t kAlignment = 4096UL; bool IsAllocThreadSafe() const override; protected: - void Free(Allocation* allocation) override; + void FreeImpl(Allocation* allocation) override; Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override; }; } // namespace allocation diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc index 430bf0be98e08787ac4412a8b6e0fcc310ffe2b4..895a24a6a2a6b8e399ec2ace48136d1ef16c62f6 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_allocator.cc @@ -23,15 +23,14 @@ namespace paddle { namespace memory { namespace allocation { bool CUDAAllocator::IsAllocThreadSafe() const { return true; } -void CUDAAllocator::Free(Allocation* allocation) { +void CUDAAllocator::FreeImpl(Allocation* allocation) { platform::CUDADeviceGuard guard(place_.device); - auto* cuda_allocation = dynamic_cast(allocation); - PADDLE_ENFORCE_NOT_NULL(cuda_allocation); - PADDLE_ENFORCE_EQ(boost::get(cuda_allocation->place()), + PADDLE_ENFORCE_EQ(boost::get(allocation->place()), place_); PADDLE_ENFORCE(cudaFree(allocation->ptr())); delete allocation; } + Allocation* CUDAAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { platform::CUDADeviceGuard guard(place_.device); void* ptr; @@ -41,8 +40,9 @@ Allocation* CUDAAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { "Cannot allocate %d on GPU %d, cuda status %d, %s", size, place_.device, status, cudaGetErrorString(status))); } - return new CUDAAllocation(ptr, size, platform::Place(place_)); + return new Allocation(ptr, size, platform::Place(place_)); } + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/cuda_allocator.h b/paddle/fluid/memory/allocation/cuda_allocator.h index 63726f5820b1c81565117c7a9bf798c17c9681f6..580a2d1df1d5997a27180740393741ec8973bf18 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_allocator.h @@ -20,13 +20,6 @@ namespace paddle { namespace memory { namespace allocation { -// CUDA System allocator and allocation. -// Just a flag type. 
-class CUDAAllocation : public Allocation { - public: - using Allocation::Allocation; -}; - class CUDAAllocator : public Allocator { public: explicit CUDAAllocator(const platform::CUDAPlace& place) : place_(place) {} @@ -35,7 +28,7 @@ class CUDAAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - void Free(Allocation* allocation) override; + void FreeImpl(Allocation* allocation) override; Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override; private: diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc index 1936f9d4cd83c53cf7b322ab29a3e0d92e042abc..0dc2de37467b7e7d23c88b4a255c14795db4c275 100644 --- a/paddle/fluid/memory/allocation/legacy_allocator.cc +++ b/paddle/fluid/memory/allocation/legacy_allocator.cc @@ -12,8 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/memory/allocation/legacy_allocator.h" - +#include #include #include #include @@ -23,9 +22,11 @@ #endif #include "glog/logging.h" +#include "paddle/fluid/memory/allocation/legacy_allocator.h" #include "paddle/fluid/memory/detail/buddy_allocator.h" #include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/split.h" @@ -36,6 +37,8 @@ DEFINE_bool(init_allocated_mem, false, "that initializing the allocated memory with a small value " "during unit testing."); DECLARE_double(fraction_of_gpu_memory_to_use); +DECLARE_uint64(initial_gpu_memory_in_mb); +DECLARE_uint64(reallocate_gpu_memory_in_mb); DECLARE_bool(benchmark); namespace paddle { @@ -131,40 +134,48 @@ size_t Used(const platform::CPUPlace &place) { } #ifdef PADDLE_WITH_CUDA -BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) { - static std::once_flag init_flag; - static detail::BuddyAllocator **a_arr = nullptr; - static std::vector devices; - - std::call_once(init_flag, [gpu_id]() { - devices = platform::GetSelectedDevices(); - int gpu_num = devices.size(); - - allocation::GPUMemMonitor.Initialize(devices.size()); +class GPUBuddyAllocatorList { + public: + GPUBuddyAllocatorList() + : allocators_(platform::GetCUDADeviceCount()), + flags_(platform::GetCUDADeviceCount()) { + allocation::GPUMemMonitor.Initialize(allocators_.size()); + } - a_arr = new BuddyAllocator *[gpu_num]; - for (size_t i = 0; i < devices.size(); ++i) { - int dev_id = devices[i]; - a_arr[i] = nullptr; + BuddyAllocator *Get(size_t dev_id) { + PADDLE_ENFORCE(dev_id < flags_.size(), "Invalid device id %s", dev_id); + std::call_once(flags_[dev_id], [this, dev_id] { platform::SetDeviceId(dev_id); - a_arr[i] = new BuddyAllocator(std::unique_ptr( - new detail::GPUAllocator(dev_id)), - platform::GpuMinChunkSize(), - platform::GpuMaxChunkSize()); - - VLOG(10) << "\n\nNOTE: each GPU device use " - << FLAGS_fraction_of_gpu_memory_to_use * 100 - << "% of GPU memory.\n" - << "You can set GFlags environment variable '" - << "FLAGS_fraction_of_gpu_memory_to_use" - << "' to change the fraction of GPU usage.\n\n"; - } - }); + allocators_[dev_id] = new BuddyAllocator( + std::unique_ptr( + new detail::GPUAllocator(dev_id)), + platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()); + + VLOG(10) << "\n\nNOTE:\n" + << "You can set GFlags environment variable " + << "'FLAGS_fraction_of_gpu_memory_to_use' " + << "or 'FLAGS_initial_gpu_memory_in_mb' " + << "or 
'FLAGS_reallocate_gpu_memory_in_mb' " + << "to change the memory size for GPU usage.\n" + << "Current 'FLAGS_fraction_of_gpu_memory_to_use' value is " + << FLAGS_fraction_of_gpu_memory_to_use + << ". Current 'FLAGS_initial_gpu_memory_in_mb' value is " + << FLAGS_initial_gpu_memory_in_mb + << ". Current 'FLAGS_reallocate_gpu_memory_in_mb' value is " + << FLAGS_reallocate_gpu_memory_in_mb << "\n\n"; + }); + return allocators_[dev_id]; + } + + private: + std::vector allocators_; + std::vector flags_; +}; +BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) { + static GPUBuddyAllocatorList allocators; platform::SetDeviceId(gpu_id); - auto pos = std::distance(devices.begin(), - std::find(devices.begin(), devices.end(), gpu_id)); - return a_arr[pos]; + return allocators.Get(gpu_id); } #endif @@ -183,7 +194,7 @@ void *Alloc(const platform::CUDAPlace &place, #ifdef PADDLE_WITH_CUDA auto *buddy_allocator = GetGPUBuddyAllocator(place.device); auto *ptr = buddy_allocator->Alloc(size); - if (ptr == nullptr) { + if (ptr == nullptr && size > 0) { int cur_dev = platform::GetCurrentDeviceId(); platform::SetDeviceId(place.device); size_t avail, total; @@ -328,18 +339,22 @@ size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const { } // namespace legacy namespace allocation { - LegacyMemMonitor GPUMemMonitor; Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { void *ptr = boost::apply_visitor(legacy::AllocVisitor(size), place_); - return new Allocation(ptr, size, place_); + auto *tmp_alloc = new Allocation(ptr, size, place_); + platform::MemEvenRecorder::Instance().PushMemRecord( + static_cast(tmp_alloc), place_, size); + return tmp_alloc; } -void LegacyAllocator::Free(Allocation *allocation) { +void LegacyAllocator::FreeImpl(Allocation *allocation) { boost::apply_visitor( legacy::FreeVisitor(allocation->ptr(), allocation->size()), allocation->place()); + platform::MemEvenRecorder::Instance().PopMemRecord( + static_cast(allocation), place_); delete allocation; } diff --git a/paddle/fluid/memory/allocation/legacy_allocator.h b/paddle/fluid/memory/allocation/legacy_allocator.h index d9bdae153da6439598f76f5cac226897e6e0c596..27cd42ea35012f07ae7db79c46d767138ddaafff 100644 --- a/paddle/fluid/memory/allocation/legacy_allocator.h +++ b/paddle/fluid/memory/allocation/legacy_allocator.h @@ -73,7 +73,7 @@ class LegacyAllocator : public Allocator { protected: Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override; - void Free(Allocation *allocation) override; + void FreeImpl(Allocation *allocation) override; private: platform::Place place_; diff --git a/paddle/fluid/memory/allocation/locked_allocator.cc b/paddle/fluid/memory/allocation/locked_allocator.cc index 835f6527c8a1d83340167bd9079f7cee25ad24cf..c43099cc88f839ad92d36774d49aafd7192f916f 100644 --- a/paddle/fluid/memory/allocation/locked_allocator.cc +++ b/paddle/fluid/memory/allocation/locked_allocator.cc @@ -14,8 +14,10 @@ #include "paddle/fluid/memory/allocation/locked_allocator.h" #include // NOLINT +#include #include "paddle/fluid/memory/allocation/allocation_with_underlying.h" #include "paddle/fluid/platform/lock_guard_ptr.h" + namespace paddle { namespace memory { namespace allocation { @@ -23,26 +25,24 @@ namespace allocation { bool LockedAllocator::IsAllocThreadSafe() const { return true; } LockedAllocator::LockedAllocator( - std::unique_ptr &&underlying_allocator) + std::shared_ptr underlying_allocator) : underlying_allocator_(std::move(underlying_allocator)) { 
PADDLE_ENFORCE_NOT_NULL(underlying_allocator_); if (!underlying_allocator_->IsAllocThreadSafe()) { mtx_.reset(new std::mutex()); } } -void LockedAllocator::Free(Allocation *allocation) { - { - platform::LockGuardPtr<std::mutex> guard(mtx_); - reinterpret_cast<AllocationWithUnderlying *>(allocation) - ->allocation_.reset(); // Destroy inner allocation - } - delete allocation; + +void LockedAllocator::FreeImpl(Allocation *allocation) { + platform::LockGuardPtr<std::mutex> guard(mtx_); + underlying_allocator_->Free(allocation); } + Allocation *LockedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { platform::LockGuardPtr<std::mutex> guard(mtx_); - return new AllocationWithUnderlying( - underlying_allocator_->Allocate(size, attr)); + return underlying_allocator_->Allocate(size, attr).release(); } + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/locked_allocator.h b/paddle/fluid/memory/allocation/locked_allocator.h index 4967b9bb8d3ad101cff4657b0a45b49b76e2deb2..b735ccef101417b3f880eb6dcdd9964cffbe875c 100644 --- a/paddle/fluid/memory/allocation/locked_allocator.h +++ b/paddle/fluid/memory/allocation/locked_allocator.h @@ -24,15 +24,15 @@ namespace allocation { // An allocator to make the underlying allocator thread safe. class LockedAllocator : public Allocator { public: - explicit LockedAllocator(std::unique_ptr<Allocator> &&underlying_allocator); + explicit LockedAllocator(std::shared_ptr<Allocator> underlying_allocator); bool IsAllocThreadSafe() const override; protected: - void Free(Allocation *allocation) override; + void FreeImpl(Allocation *allocation) override; Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override; private: - std::unique_ptr<Allocator> underlying_allocator_; + std::shared_ptr<Allocator> underlying_allocator_; std::unique_ptr<std::mutex> mtx_; }; diff --git a/paddle/fluid/memory/allocation/allocator_facade_test.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator_facade_test.cc similarity index 96% rename from paddle/fluid/memory/allocation/allocator_facade_test.cc rename to paddle/fluid/memory/allocation/naive_best_fit_allocator_facade_test.cc index 802d79e15de253d4e67e35046bdf1d689258da6d..3334589a4beb407447cf89c173f6128654bb245a 100644 --- a/paddle/fluid/memory/allocation/allocator_facade_test.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator_facade_test.cc @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License.
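// NOTE: A hedged sketch of exercising the strategy switch this rename is
// about (the test binary name and flag-passing style are illustrative; the
// flag names come from this patch):
//
//   ./naive_best_fit_allocator_facade_test --allocator_strategy=naive_best_fit \
//       --fraction_of_gpu_memory_to_use=0.01 --gpu_allocator_retry_time=500
//
// Setting FLAGS_allocator_strategy = "naive_best_fit" in code, as this test
// does below, has the same effect; any other value now fails fast through
// PADDLE_THROW in GetStrategyFromFlag() instead of silently selecting
// naive_best_fit.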
-#include "paddle/fluid/memory/allocation/allocator_facade.h" #include #include +#include "paddle/fluid/memory/allocation/allocator_facade.h" #ifdef PADDLE_WITH_CUDA DECLARE_double(fraction_of_gpu_memory_to_use); @@ -22,6 +22,8 @@ DECLARE_double(fraction_of_cuda_pinned_memory_to_use); DECLARE_int64(gpu_allocator_retry_time); #endif +DECLARE_string(allocator_strategy); + namespace paddle { namespace memory { namespace allocation { @@ -33,6 +35,8 @@ TEST(allocator, allocator) { FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5; #endif + FLAGS_allocator_strategy = "naive_best_fit"; + auto &instance = AllocatorFacade::Instance(); platform::Place place; size_t size = 1024; diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc index de81d12cca6ca280289371abdec225c9e2b8f4d0..5a3d817211750d3e19e65344d1eab5a96800c674 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/pinned_allocator.cc @@ -20,20 +20,15 @@ namespace paddle { namespace memory { namespace allocation { bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; } -void CPUPinnedAllocator::Free(Allocation *allocation) { - PADDLE_ENFORCE_NOT_NULL(dynamic_cast(allocation)); +void CPUPinnedAllocator::FreeImpl(Allocation *allocation) { PADDLE_ENFORCE(cudaFreeHost(allocation->ptr())); delete allocation; } Allocation *CPUPinnedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { - // PADDLE_ENFORCE_EQ( - // attr, kCrossDevice, - // "CPUPinnedAllocator should be used for Cross-Device Communication"); - void *ptr; PADDLE_ENFORCE(cudaHostAlloc(&ptr, size, cudaHostAllocPortable)); - return new CPUPinnedAllocation(ptr, size); + return new Allocation(ptr, size, platform::CUDAPinnedPlace()); } } // namespace allocation } // namespace memory diff --git a/paddle/fluid/memory/allocation/pinned_allocator.h b/paddle/fluid/memory/allocation/pinned_allocator.h index 42d0938f2afbb1efca8bfdd7035bc0eada30f06b..deeb55a8fb0396a312286f5c2692114e9e4afc8d 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.h +++ b/paddle/fluid/memory/allocation/pinned_allocator.h @@ -20,18 +20,12 @@ namespace memory { namespace allocation { // Allocator uses `cudaHostAlloc` -class CPUPinnedAllocation : public Allocation { - public: - CPUPinnedAllocation(void *ptr, size_t size) - : Allocation(ptr, size, platform::CUDAPinnedPlace()) {} -}; - class CPUPinnedAllocator : public Allocator { public: bool IsAllocThreadSafe() const override; protected: - void Free(Allocation *allocation) override; + void FreeImpl(Allocation *allocation) override; Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override; }; diff --git a/paddle/fluid/memory/allocation/retry_allocator.cc b/paddle/fluid/memory/allocation/retry_allocator.cc index 981705051b449e6a35c2dcce9138dc2efae52920..7e888988f9602e362d73f64c1b45552e84e3349c 100644 --- a/paddle/fluid/memory/allocation/retry_allocator.cc +++ b/paddle/fluid/memory/allocation/retry_allocator.cc @@ -18,25 +18,15 @@ namespace paddle { namespace memory { namespace allocation { -bool RetryAllocator::IsAllocThreadSafe() const { - return underlying_allocator_->IsAllocThreadSafe(); -} - -void RetryAllocator::Free(Allocation* allocation) { +void RetryAllocator::FreeImpl(Allocation* allocation) { // Delete underlying allocation first. - reinterpret_cast(allocation)->allocation_.reset(); - { - // notify all waited allocators, they can try to allocate memory after free. 
- std::lock_guard lock(mutex_); - cv_.notify_all(); - } - delete allocation; + underlying_allocator_->Free(allocation); + cv_.notify_all(); } Allocation* RetryAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { auto alloc_func = [&, this]() { - return new AllocationWithUnderlying( - underlying_allocator_->Allocate(size, attr)); + return underlying_allocator_->Allocate(size, attr).release(); }; // In fact, we can unify the code of allocation success and failure // But it would add lock even when allocation success at the first time diff --git a/paddle/fluid/memory/allocation/retry_allocator.h b/paddle/fluid/memory/allocation/retry_allocator.h index 5efcac8b108002a2a2da920173d237096de4fffa..379f576d6e1ed8f256a0233b203423a487ee73e4 100644 --- a/paddle/fluid/memory/allocation/retry_allocator.h +++ b/paddle/fluid/memory/allocation/retry_allocator.h @@ -18,38 +18,32 @@ #include // NOLINT #include #include // NOLINT +#include #include "paddle/fluid/memory/allocation/allocator.h" namespace paddle { namespace memory { namespace allocation { -class RetryAllocator; - class RetryAllocator : public Allocator { public: - RetryAllocator(std::unique_ptr&& allocator, size_t retry_ms) + RetryAllocator(std::shared_ptr allocator, size_t retry_ms) : underlying_allocator_(std::move(allocator)), retry_time_(retry_ms) { - EnforceCheck(); - } - - bool IsAllocThreadSafe() const override; - - private: - void EnforceCheck() { PADDLE_ENFORCE_NOT_NULL( - underlying_allocator_.get(), - "UnderlyingAllocator of RetryAllocator must be UnmanagedAllocator"); + underlying_allocator_, + "UnderlyingAllocator of RetryAllocator must not be null"); PADDLE_ENFORCE(underlying_allocator_->IsAllocThreadSafe(), "UnderlyingAllocator of RetryAllocator must be thread-safe"); } + bool IsAllocThreadSafe() const override { return true; } + protected: - void Free(Allocation* allocation) override; + void FreeImpl(Allocation* allocation) override; Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override; private: - std::unique_ptr underlying_allocator_; + std::shared_ptr underlying_allocator_; std::chrono::milliseconds retry_time_; std::mutex mutex_; std::condition_variable cv_; @@ -57,8 +51,6 @@ class RetryAllocator : public Allocator { // For debug, We can add an atomic integer to record how many memory sizes are // waited to allocate // std::atomic waited_allocate_size_{0}; - - friend class RetryAllocation; }; } // namespace allocation diff --git a/paddle/fluid/memory/allocation/zero_size_allocator.cc b/paddle/fluid/memory/allocation/zero_size_allocator.cc index cb2df1a029815478bbc9d3b09425f3ef145c5fb3..39743bcb10c700c9a8446b9040c8a8707d57ec7d 100644 --- a/paddle/fluid/memory/allocation/zero_size_allocator.cc +++ b/paddle/fluid/memory/allocation/zero_size_allocator.cc @@ -24,11 +24,20 @@ bool ZeroSizeAllocator::IsAllocThreadSafe() const { Allocation *ZeroSizeAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { if (size == 0) { - return new ZeroSizeAllocation(place_); + return new Allocation(nullptr, 0, place_); } else { return underlying_allocator_->Allocate(size, attr).release(); } } + +void ZeroSizeAllocator::FreeImpl(Allocation *allocation) { + if (allocation->size() == 0) { + delete allocation; + } else { + underlying_allocator_->Free(allocation); + } +} + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/zero_size_allocator.h b/paddle/fluid/memory/allocation/zero_size_allocator.h index 
6b80245a34e7a6834aa75a90218845cc92036881..08a7a06dbf290b55994a407fe478f792b0c0964a 100644 --- a/paddle/fluid/memory/allocation/zero_size_allocator.h +++ b/paddle/fluid/memory/allocation/zero_size_allocator.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once +#include #include #include "paddle/fluid/memory/allocation/allocator.h" @@ -23,12 +24,6 @@ namespace allocation { // The allocator handles the request's size is zero. Allocator will always // return an allocation even the request size is zero. However, the // allocation.ptr() is nullptr -class ZeroSizeAllocation : public Allocation { - public: - explicit ZeroSizeAllocation(const platform::Place& p) - : Allocation(nullptr, 0, p) {} -}; - class ZeroSizeAllocator : public Allocator { public: ZeroSizeAllocator(std::shared_ptr underlying_allocator, @@ -39,6 +34,7 @@ class ZeroSizeAllocator : public Allocator { protected: Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override; + void FreeImpl(Allocation* allocation) override; private: std::shared_ptr underlying_allocator_; diff --git a/paddle/fluid/memory/detail/CMakeLists.txt b/paddle/fluid/memory/detail/CMakeLists.txt index c725dba5e98c200c2542d97cb8f53a938f6b614a..a555b6b299228720c7559e610f4d6f31167e1555 100644 --- a/paddle/fluid/memory/detail/CMakeLists.txt +++ b/paddle/fluid/memory/detail/CMakeLists.txt @@ -9,3 +9,5 @@ endif(${WITH_GPU}) cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator) cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS memory_block system_allocator glog) + +cc_test(buddy_allocator_test SRCS buddy_allocator_test.cc DEPS buddy_allocator) diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc index 26ef27c3caafadb4801b0ae52133f6175655ce0a..edd6ea4adec2e080d294fdb207d8dd4880fdcf79 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -13,6 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/memory/detail/buddy_allocator.h" + +#include +#include + #include "glog/logging.h" DEFINE_bool(free_idle_memory, false, @@ -36,9 +40,10 @@ BuddyAllocator::~BuddyAllocator() { "have actually been freed"; while (!pool_.empty()) { auto block = static_cast(std::get<2>(*pool_.begin())); - VLOG(10) << "Free from block (" << block << ", " << max_chunk_size_ << ")"; + VLOG(10) << "Free from block (" << block << ", " << block->size(cache_) + << ")"; - system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); + system_allocator_->Free(block, block->size(cache_), block->index(cache_)); cache_.invalidate(block); pool_.erase(pool_.begin()); } @@ -71,7 +76,7 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) { // refill the pool if failure if (it == pool_.end()) { - it = RefillPool(); + it = RefillPool(size); // if still failure, fail fatally if (it == pool_.end()) { return nullptr; @@ -184,19 +189,28 @@ void* BuddyAllocator::SystemAlloc(size_t size) { return static_cast(p)->data(); } -BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { +BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool( + size_t request_bytes) { + size_t allocate_bytes = max_chunk_size_; + size_t index = 0; + #ifdef PADDLE_WITH_CUDA if (system_allocator_->UseGpu()) { if ((total_used_ + total_free_) == 0) { - // Compute the maximum allocation size for the first allocation. 
- max_chunk_size_ = platform::GpuMaxChunkSize(); + // Compute the allocation size for gpu for the first allocation. + allocate_bytes = std::max(platform::GpuInitAllocSize(), request_bytes); + } else { + // Reallocation size + if (realloc_size_ == 0) { + realloc_size_ = platform::GpuReallocSize(); + } + allocate_bytes = std::max(realloc_size_, request_bytes); } } #endif - // Allocate a new maximum sized block - size_t index = 0; - void* p = system_allocator_->Alloc(&index, max_chunk_size_); + // Allocate a new block + void* p = system_allocator_->Alloc(&index, allocate_bytes); if (p == nullptr) return pool_.end(); @@ -204,7 +218,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { << " from system allocator"; static_cast(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index, - max_chunk_size_, nullptr, nullptr); + allocate_bytes, nullptr, nullptr); // gpu fallback allocation if (system_allocator_->UseGpu() && @@ -212,10 +226,10 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { fallback_alloc_count_++; } - total_free_ += max_chunk_size_; + total_free_ += allocate_bytes; // dump the block into pool - return pool_.insert(IndexSizeAddress(index, max_chunk_size_, p)).first; + return pool_.insert(IndexSizeAddress(index, allocate_bytes, p)).first; } BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) { @@ -286,12 +300,12 @@ void BuddyAllocator::CleanIdleFallBackAlloc() { VLOG(10) << "Return block " << block << " to fallback allocator."; - system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); + system_allocator_->Free(block, block->size(cache_), block->index(cache_)); cache_.invalidate(block); pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base())); - total_free_ -= max_chunk_size_; + total_free_ -= block->size(cache_); fallback_alloc_count_--; // If no fall allocation exists, return directly @@ -322,12 +336,12 @@ void BuddyAllocator::CleanIdleNormalAlloc() { VLOG(10) << "Return block " << block << " to base allocator."; - system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); + system_allocator_->Free(block, block->size(cache_), block->index(cache_)); cache_.invalidate(block); pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base())); - total_free_ -= max_chunk_size_; + total_free_ -= block->size(cache_); if (!shall_free_alloc()) return; } diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h index 3f86a51f0d0b8504bbc4b0477f123093b343e9cf..bdc8cca4b55e6fe67618fb13cd8bf40c2c24858b 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.h +++ b/paddle/fluid/memory/detail/buddy_allocator.h @@ -60,7 +60,7 @@ class BuddyAllocator { void* SystemAlloc(size_t size); /*! 
\brief If existing chunks are not suitable, refill pool */ - PoolSet::iterator RefillPool(); + PoolSet::iterator RefillPool(size_t request_bytes); /** * \brief Find the suitable chunk from existing pool and split * @@ -89,6 +89,8 @@ size_t min_chunk_size_; // the minimum size of each chunk size_t max_chunk_size_; // the maximum size of each chunk + size_t realloc_size_ = 0; // the size of re-allocated chunk + private: /** * \brief A list of free allocation diff --git a/paddle/fluid/memory/detail/buddy_allocator_test.cc b/paddle/fluid/memory/detail/buddy_allocator_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..1edc9f2034c87d4dbd655135c557bdb86ec4354d --- /dev/null +++ b/paddle/fluid/memory/detail/buddy_allocator_test.cc @@ -0,0 +1,133 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/memory/detail/buddy_allocator.h" + +#include + +#include "gflags/gflags.h" +#include "gtest/gtest.h" +#include "paddle/fluid/memory/detail/system_allocator.h" +#include "paddle/fluid/platform/gpu_info.h" + +#ifdef PADDLE_WITH_CUDA +DECLARE_double(fraction_of_gpu_memory_to_use); +DECLARE_uint64(initial_gpu_memory_in_mb); +DECLARE_uint64(reallocate_gpu_memory_in_mb); +#endif + +namespace paddle { +namespace memory { +namespace detail { + +constexpr static int test_gpu_id = 0; + +void TestBuddyAllocator(BuddyAllocator* allocator, size_t size_bytes) { + bool freed = false; + size_t used_bytes = allocator->Used(); + + if (size_bytes > 0) { + void* p = allocator->Alloc(size_bytes); + + EXPECT_NE(p, nullptr); +#ifdef PADDLE_WITH_CUDA + if (size_bytes < platform::GpuMaxChunkSize()) { +#else + if (size_bytes < platform::CpuMaxChunkSize()) { +#endif + // Not allocate from SystemAllocator + EXPECT_GE(allocator->Used(), used_bytes + size_bytes); + } else { + // Allocate from SystemAllocator doesn't count in Used() + EXPECT_EQ(allocator->Used(), used_bytes); + } + + int* intp = static_cast<int*>(p); + std::shared_ptr<int> ptr(intp, [&](void* p) { + allocator->Free(intp); + freed = true; + }); + } else { + freed = true; + } + + EXPECT_EQ(used_bytes, allocator->Used()); + EXPECT_TRUE(freed); +} + +#ifdef PADDLE_WITH_CUDA +TEST(BuddyAllocator, GpuFraction) { + FLAGS_fraction_of_gpu_memory_to_use = 0.01; + + BuddyAllocator buddy_allocator( + std::unique_ptr<SystemAllocator>(new GPUAllocator(test_gpu_id)), + platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()); + + TestBuddyAllocator(&buddy_allocator, 10); + TestBuddyAllocator(&buddy_allocator, 10 << 10); + TestBuddyAllocator(&buddy_allocator, 10 << 20); + TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30)); +} + +TEST(BuddyAllocator, InitRealloc) { + FLAGS_initial_gpu_memory_in_mb = 100; + FLAGS_reallocate_gpu_memory_in_mb = 50; + + EXPECT_EQ(platform::GpuMaxChunkSize(), static_cast<size_t>(100 << 20)); + + BuddyAllocator buddy_allocator( + std::unique_ptr<SystemAllocator>(new GPUAllocator(test_gpu_id)), + platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()); + + // Less than initial size and reallocate size
+ TestBuddyAllocator(&buddy_allocator, 10 << 20); + // Between initial size and reallocate size and not exceed pool + TestBuddyAllocator(&buddy_allocator, 80 << 20); + // Less than reallocate size and exceed pool + TestBuddyAllocator(&buddy_allocator, 40 << 20); + // Greater than reallocate size and exceed pool + TestBuddyAllocator(&buddy_allocator, 80 << 20); + // Greater than initial size and reallocate size + TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30)); +} + +TEST(BuddyAllocator, ReallocSizeGreaterThanInit) { + FLAGS_initial_gpu_memory_in_mb = 5; + FLAGS_reallocate_gpu_memory_in_mb = 10; + + EXPECT_EQ(platform::GpuMaxChunkSize(), static_cast<size_t>(10 << 20)); + + BuddyAllocator buddy_allocator( + std::unique_ptr<SystemAllocator>(new GPUAllocator(test_gpu_id)), + platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()); + + // Less than initial size and reallocate size + TestBuddyAllocator(&buddy_allocator, 1 << 20); + // Between initial size and reallocate size and not exceed pool + TestBuddyAllocator(&buddy_allocator, 3 << 20); + // Less than initial size and exceed pool + TestBuddyAllocator(&buddy_allocator, 3 << 20); + // Less than reallocate size and not exceed pool (now pool is 15 MB, used 7 + // MB) + TestBuddyAllocator(&buddy_allocator, 7 << 20); + // Less than reallocate size and exceed pool + TestBuddyAllocator(&buddy_allocator, 8 << 20); + // Greater than initial size and reallocate size + TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30)); +} +#endif + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 197d1c2f21fd818879aafe17599bc87d33caa198..41d79c5beb1367907a401b572d3d0eaf3a8ac67b 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -32,6 +32,9 @@ limitations under the License. */ DECLARE_bool(use_pinned_memory); DECLARE_double(fraction_of_gpu_memory_to_use); +DECLARE_uint64(initial_gpu_memory_in_mb); +DECLARE_uint64(reallocate_gpu_memory_in_mb); + namespace paddle { namespace memory { namespace detail { @@ -119,11 +122,18 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { gpu_alloc_size_ += size; return p; } else { - LOG(WARNING) - << "Cannot malloc " << size / 1024.0 / 1024.0 - << " MB GPU memory. Please shrink FLAGS_fraction_of_gpu_memory_to_use " "environment variable to a lower value. Current value is " - << FLAGS_fraction_of_gpu_memory_to_use; + LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0 + << " MB GPU memory. Please shrink " "FLAGS_fraction_of_gpu_memory_to_use or " "FLAGS_initial_gpu_memory_in_mb or " "FLAGS_reallocate_gpu_memory_in_mb " "environment variable to a lower value. " + << "Current FLAGS_fraction_of_gpu_memory_to_use value is " + << FLAGS_fraction_of_gpu_memory_to_use + << ". Current FLAGS_initial_gpu_memory_in_mb value is " + << FLAGS_initial_gpu_memory_in_mb + << ". Current FLAGS_reallocate_gpu_memory_in_mb value is " + << FLAGS_reallocate_gpu_memory_in_mb; return nullptr; } } diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 2a6f70a01e303aa1b608248cbeb8dcfa24837a0c..1408163e4b5278ddcd65eb4f2900109d772a589a 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/memcpy.h" #include <cstring> // for memcpy +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace memory { @@ -29,14 +30,23 @@ void Copy(platform::CPUPlace, void* dst, #ifdef PADDLE_WITH_CUDA static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K +// NOTE(zcd): Avoid using GpuMemcpySync whenever possible, +// because GpuMemcpySync issues the copying command to the default stream, +// which prevents commands from different streams from running concurrently. +// Reference: +// https://devblogs.nvidia.com/gpu-pro-tip-cuda-7-streams-simplify-concurrency/ + template <> void Copy<platform::CPUPlace, platform::CUDAPlace>( platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place, const void* src, size_t num, cudaStream_t stream) { platform::SetDeviceId(src_place.device); + if (stream) { + platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CPU"); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); } else { + platform::RecordEvent record_event("GpuMemcpySync:GPU->CPU"); platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost); // FIXME(zjl): do we really need it? if (num <= kMaxGpuAsyncCopyBytes) { @@ -51,8 +61,10 @@ void Copy( const void* src, size_t num, cudaStream_t stream) { platform::SetDeviceId(dst_place.device); if (stream) { + platform::RecordEvent record_event("GpuMemcpyAsync:CPU->GPU"); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); } else { + platform::RecordEvent record_event("GpuMemcpySync:CPU->GPU"); platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice); // FIXME(zjl): do we really need it? if (num <= kMaxGpuAsyncCopyBytes) { @@ -68,15 +80,19 @@ void Copy( if (dst_place == src_place) { platform::SetDeviceId(src_place.device); if (stream) { + platform::RecordEvent record_event("GpuMemcpyAsync(same_gpu):GPU->GPU"); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream); } else { + platform::RecordEvent record_event("GpuMemcpySync(same_gpu):GPU->GPU"); platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice); } } else { if (stream) { + platform::RecordEvent record_event("GpuMemcpyPeerAsync:GPU->GPU"); platform::GpuMemcpyPeerAsync(dst, dst_place.device, src, src_place.device, num, stream); } else { + platform::RecordEvent record_event("GpuMemcpyPeerSync:GPU->GPU"); platform::GpuMemcpyPeerSync(dst, dst_place.device, src, src_place.device, num); } @@ -111,8 +127,10 @@ void Copy( cudaStream_t stream) { platform::SetDeviceId(src_place.device); if (stream) { + platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned"); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); } else { + platform::RecordEvent record_event("GpuMemcpySync:GPU->CUDAPinned"); platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost); } } @@ -124,8 +142,10 @@ void Copy( cudaStream_t stream) { platform::SetDeviceId(dst_place.device); if (stream) { + platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU"); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); } else { + platform::RecordEvent record_event("GpuMemcpySync:CUDAPinned->GPU"); platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice); } } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index a3f2a69aef52b6f55aa09e6dee2c22c048626c0d..e52e83673fe1c9ad2426e45f233c5e62f5c2f06e 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -34,6 +34,10 @@ if (WITH_GPU AND TENSORRT_FOUND) 
add_subdirectory(tensorrt) endif() +if (ANAKIN_FOUND) + add_subdirectory(anakin) +endif() + SET(OP_HEADER_DEPS xxhash) if (WITH_GPU) SET(OP_HEADER_DEPS ${OP_HEADER_DEPS} cub) @@ -44,10 +48,10 @@ if (WITH_DISTRIBUTE) SET(OP_PREFETCH_DEPS ${OP_PREFETCH_DEPS} parameter_prefetch) endif() -register_operators(EXCLUDES py_func_op warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS}) +register_operators(EXCLUDES py_func_op warpctc_op dgc_op conv_fusion_op sync_batch_norm_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS}) -# warpctc_op needs cudnn 7 above if (WITH_GPU) + # warpctc_op needs cudnn 7 above if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc) else() @@ -58,12 +62,22 @@ if (WITH_GPU) op_library(conv_fusion_op) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_fusion);\n") endif() + if (NOT WIN32) + op_library(sync_batch_norm_op) + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sync_batch_norm);\n") + endif() else() op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) endif() set(COMMON_OP_DEPS ${OP_HEADER_DEPS}) +if (WITH_GPU AND NOT WIN32) + op_library(dgc_op DEPS dgc) + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(dgc);\n") + set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dgc) +endif() + set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col) diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 2feb8e4c4787440fd086c597fa2a7f97204e34ac..c87e4b22b37027efd1293e74f72598283946e62d 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/activation_op.h" +#include #include +#include #include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h" #include "paddle/fluid/platform/port.h" #ifdef PADDLE_WITH_CUDA @@ -74,12 +76,16 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx, const std::string& name) { framework::LibraryType library{framework::LibraryType::kPlain}; framework::DataLayout layout = framework::DataLayout::kAnyLayout; -#ifdef PADDLE_WITH_CUDA - auto it1 = oper.Attrs().find("use_cudnn"); - if (it1 != oper.Attrs().end() && platform::CanCUDNNBeUsed(ctx)) { - library = framework::LibraryType::kCUDNN; - } -#endif +// FIXME(liuwei1031) temporarily disable the code to unblock users +// TODO(liuwei1031) figure out the reason behind +// https://github.com/PaddlePaddle/Paddle/issues/16096 +// and re-enable this in the future +// #ifdef PADDLE_WITH_CUDA +// auto it1 = oper.Attrs().find("use_cudnn"); +// if (it1 != oper.Attrs().end() && platform::CanCUDNNBeUsed(ctx)) { +// library = framework::LibraryType::kCUDNN; +// } +// #endif #ifdef PADDLE_WITH_MKLDNN auto it = oper.Attrs().find("use_mkldnn"); if (library == framework::LibraryType::kPlain && it != oper.Attrs().end() && @@ -186,6 +192,9 @@ $$out = x - \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ UNUSED constexpr char SqrtDoc[] = R"DOC( Sqrt Activation Operator. 
+Please make sure the input is legal; when the input is a negative value close to zero, +add a small epsilon (e.g. 1e-12) to avoid a negative number caused by numerical error. + $out = \sqrt{x}$ )DOC"; @@ -269,6 +278,48 @@ $$out = \\frac{x}{1 + \|x\|}$$ )DOC"; +class AcosOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Input of acos operator"); + AddOutput("Out", "Output of acos operator"); + AddComment(R"DOC( +Arccosine Activation Operator. + +$$out = \cos^{-1}(x)$$ + +)DOC"); + } +}; + +class AsinOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Input of asin operator"); + AddOutput("Out", "Output of asin operator"); + AddComment(R"DOC( +Arcsine Activation Operator. + +$$out = \sin^{-1}(x)$$ + +)DOC"); + } +}; + +class AtanOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Input of atan operator"); + AddOutput("Out", "Output of atan operator"); + AddComment(R"DOC( +Arctangent Activation Operator. + +$$out = \tan^{-1}(x)$$ + +)DOC"); + } +}; + class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -543,7 +594,10 @@ namespace ops = paddle::operators; __macro(SoftShrink, softshrink); \ __macro(Abs, abs); \ __macro(Cos, cos); \ + __macro(Acos, acos); \ __macro(Sin, sin); \ + __macro(Asin, asin); \ + __macro(Atan, atan); \ __macro(Round, round); \ __macro(Log, log); \ __macro(Square, square); \ diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 1f5ae7fb5cd2e1c14190602d2c35e6c3755cfd70..ff7e623f6f383ed2a8b8a40b3186d9c439ff1d86 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -39,9 +39,8 @@ namespace operators { Please refer to the layer_helper.py and get the details. 
*/ static std::unordered_set InplaceOpSet = { - "sigmoid", "exp", "relu", "tanh", "sqrt", "ceil", - "floor", "reciprocal", "relu6", "soft_relu", "hard_sigmoid", -}; + "sigmoid", "exp", "relu", "tanh", "sqrt", "ceil", + "floor", "reciprocal", "relu6", "soft_relu", "hard_sigmoid"}; static bool IsInplace(const std::string& op) { bool inplace = InplaceOpSet.count(op); @@ -553,6 +552,101 @@ struct SinFunctor : public BaseActivationFunctor { } }; +template +struct Acos { + HOSTDEVICE T operator()(const T& val) const { return acos(val); } +}; + +template <> +struct Acos { + HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { + return platform::float16(acos(static_cast(val))); + } +}; + +// Acos(x) = acos(x) +template +struct AcosFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Acos()); + } +}; + +// acos'(x) = -1/sqrt(1-x^2) +template +struct AcosGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = + -dout * static_cast(1) / (static_cast(1) - x.square()).sqrt(); + } +}; + +template +struct Asin { + HOSTDEVICE T operator()(const T& val) const { return asin(val); } +}; + +template <> +struct Asin { + HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { + return platform::float16(asin(static_cast(val))); + } +}; + +// Asin(x) = asin(x) +template +struct AsinFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Asin()); + } +}; + +// asin'(x) = 1/sqrt(1-x^2) +template +struct AsinGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = + dout * static_cast(1) / (static_cast(1) - x.square()).sqrt(); + } +}; + +template +struct Atan { + HOSTDEVICE T operator()(const T& val) const { return atan(val); } +}; + +template <> +struct Atan { + HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { + return platform::float16(atan(static_cast(val))); + } +}; + +// Atan(x) = atan(x) +template +struct AtanFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Atan()); + } +}; + +// atan'(x) = 1 / (1 + x^2) +template +struct AtanGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * static_cast(1) / (static_cast(1) + x.square()); + } +}; + // round(x) = [x] template struct RoundFunctor : public BaseActivationFunctor { @@ -1001,13 +1095,16 @@ struct SwishGradFunctor : public BaseActivationFunctor { __macro(relu, ReluFunctor, ReluGradFunctor); \ __macro(gelu, GeluFunctor, GeluGradFunctor); \ __macro(tanh, TanhFunctor, TanhGradFunctor); \ + __macro(atan, AtanFunctor, AtanGradFunctor); \ __macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ __macro(sqrt, SqrtFunctor, SqrtGradFunctor); \ __macro(abs, AbsFunctor, AbsGradFunctor); \ __macro(ceil, CeilFunctor, ZeroGradFunctor); \ __macro(floor, FloorFunctor, ZeroGradFunctor); \ __macro(cos, CosFunctor, CosGradFunctor); \ + __macro(acos, AcosFunctor, AcosGradFunctor); \ __macro(sin, SinFunctor, SinGradFunctor); \ + __macro(asin, AsinFunctor, AsinGradFunctor); \ __macro(round, RoundFunctor, ZeroGradFunctor); \ __macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ __macro(log, 
diff --git a/paddle/fluid/operators/add_position_encoding_op.cc b/paddle/fluid/operators/add_position_encoding_op.cc
index 8127e554bed1aae7a5ce8837bcadf1b7f13f1ac2..3882bbedaa0be0ba14bca9c4fcb626d5ecaab129 100644
--- a/paddle/fluid/operators/add_position_encoding_op.cc
+++ b/paddle/fluid/operators/add_position_encoding_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/add_position_encoding_op.h"
+#include <memory>

 namespace paddle {
 namespace operators {
@@ -39,13 +40,8 @@ class AddPositionEncodingOpGrad : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;

   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) must not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Out"), "Out must not be null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Out@GRAD must not be null.");
-
-    auto out_dims = ctx->GetInputDim("Out");
     if (ctx->HasOutput(framework::GradVarName("X"))) {
+      auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
       ctx->SetOutputDim(framework::GradVarName("X"), out_dims);
     }
   }
@@ -75,6 +71,22 @@ class AddPositionEncodingOpMaker : public framework::OpProtoAndCheckerMaker {
   }
 };

+class AddPositionEncodingGradOpDescMaker
+    : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("add_position_encoding_grad");
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle

@@ -83,7 +95,7 @@ namespace plt = paddle::platform;

 REGISTER_OPERATOR(add_position_encoding, ops::AddPositionEncodingOp,
                   ops::AddPositionEncodingOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::AddPositionEncodingGradOpDescMaker);
 REGISTER_OPERATOR(add_position_encoding_grad, ops::AddPositionEncodingOpGrad);

 REGISTER_OP_CPU_KERNEL(
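For context on the affine_channel changes that follow: the op applies a per-channel affine transform, which is why the new checks require Scale and Bias to be 1-D tensors of length C (the channel count). In NCHW layout the computation is:

```latex
Out_{n,c,h,w} = Scale_{c} \cdot X_{n,c,h,w} + Bias_{c},
\qquad Scale,\ Bias \in \mathbb{R}^{C}
```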
diff --git a/paddle/fluid/operators/affine_channel_op.cc b/paddle/fluid/operators/affine_channel_op.cc
index 8944a749674c3ba6c83526e4d66f449075716f43..268a5b894a95df8e27730879473b457a31e18cd6 100644
--- a/paddle/fluid/operators/affine_channel_op.cc
+++ b/paddle/fluid/operators/affine_channel_op.cc
@@ -67,6 +67,22 @@ class AffineChannelOp : public framework::OperatorWithKernel {
                    "Input(Bias) of AffineChannelOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of AffineChannelOp should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto scale_dims = ctx->GetInputDim("Scale");
+    auto b_dims = ctx->GetInputDim("Bias");
+    const framework::DataLayout data_layout = framework::StringToDataLayout(
+        ctx->Attrs().Get<std::string>("data_layout"));
+
+    const int64_t C = (data_layout == framework::DataLayout::kNCHW
+                           ? x_dims[1]
+                           : x_dims[x_dims.size() - 1]);
+
+    PADDLE_ENFORCE_EQ(scale_dims.size(), 1UL);
+    PADDLE_ENFORCE_EQ(scale_dims[0], C);
+    PADDLE_ENFORCE_EQ(b_dims.size(), 1UL);
+    PADDLE_ENFORCE_EQ(b_dims[0], C);
+
     ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
     ctx->ShareLoD("X", "Out");
   }
@@ -97,6 +113,27 @@ class AffineChannelOpGrad : public framework::OperatorWithKernel {
   }
 };

+class AffineChannelGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* op = new framework::OpDesc();
+    op->SetType("affine_channel_grad");
+    op->SetInput("X", Input("X"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetInput("Scale", Input("Scale"));
+
+    op->SetAttrMap(Attrs());
+
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale"));
+    op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
+
+    return std::unique_ptr<framework::OpDesc>(op);
+  }
+};
+
 template <typename T>
 using EigenArrayMap =
     Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
@@ -244,8 +281,7 @@ namespace ops = paddle::operators;
 using CPU = paddle::platform::CPUDeviceContext;

 REGISTER_OPERATOR(affine_channel, ops::AffineChannelOp,
-                  ops::AffineChannelOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::AffineChannelOpMaker, ops::AffineChannelGradMaker);
 REGISTER_OPERATOR(affine_channel_grad, ops::AffineChannelOpGrad);

 REGISTER_OP_CPU_KERNEL(affine_channel, ops::AffineChannelKernel<CPU, float>,
diff --git a/paddle/fluid/operators/anakin/CMakeLists.txt b/paddle/fluid/operators/anakin/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5eacefc645bab288da7c289a5d7701abbcbef03d
--- /dev/null
+++ b/paddle/fluid/operators/anakin/CMakeLists.txt
@@ -0,0 +1,2 @@
+op_library(anakin_engine_op DEPS anakin_engine anakin_op_converter)
+# file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(anakin_engine);\n")
diff --git a/paddle/fluid/operators/anakin/anakin_engine_op.cc b/paddle/fluid/operators/anakin/anakin_engine_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..58db16ea0c1347a366a4d5927e414d76864cb6ab
--- /dev/null
+++ b/paddle/fluid/operators/anakin/anakin_engine_op.cc
@@ -0,0 +1,53 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_CUDA
+
+#include <memory>
+#include <string>
+
+#include "paddle/fluid/operators/anakin/anakin_engine_op.h"
+
+namespace paddle {
+
+namespace operators {
+
+class AnakinEngineOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Xs", "A list of inputs.").AsDuplicable();
+    AddOutput("Ys", "A list of outputs").AsDuplicable();
+    AddAttr<std::string>("subgraph", "the subgraph.");
+    AddAttr<std::string>(
+        "engine_key",
+        "The engine_key here is used to distinguish different Anakin engines");
+    AddAttr<framework::BlockDesc *>("sub_block", "the anakin sub-block");
+    AddComment("Anakin engine operator.");
+  }
+};
+
+class AnakinEngineInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(framework::InferVarTypeContext *ctx) const override {}
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(anakin_engine, ops::AnakinEngineOp, ops::AnakinEngineOpMaker,
+                  ops::AnakinEngineInferVarType);
+
+#endif  // PADDLE_WITH_CUDA
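The header that follows fetches a prebuilt engine from a global singleton keyed by `engine_key`. A simplified sketch of that cache pattern, with stand-in types rather than Paddle's actual `AnakinEngineManager` interface:

```cpp
// Hypothetical, simplified engine cache keyed by engine_key.
#include <memory>
#include <string>
#include <unordered_map>

struct Engine {};  // stand-in for a compiled Anakin subgraph

class EngineManager {
 public:
  static EngineManager& Global() {
    static EngineManager instance;  // one cache per process
    return instance;
  }

  // Returns the engine for this key, creating it on first use, so each
  // subgraph is compiled once and reused on every subsequent run.
  Engine* GetOrCreate(const std::string& engine_key) {
    auto& slot = engines_[engine_key];
    if (!slot) slot.reset(new Engine());
    return slot.get();
  }

 private:
  std::unordered_map<std::string, std::unique_ptr<Engine>> engines_;
};
```

This lazy lookup is also why `anakin_engine_` is declared `mutable` in the operator below: it is filled in on the first call through the const `RunImpl` path.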
diff --git a/paddle/fluid/operators/anakin/anakin_engine_op.h b/paddle/fluid/operators/anakin/anakin_engine_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..9d5b4f6f54ccfc9802cef6abac428e28a72ac293
--- /dev/null
+++ b/paddle/fluid/operators/anakin/anakin_engine_op.h
@@ -0,0 +1,162 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#pragma once + +#ifdef PADDLE_WITH_CUDA + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/inference/anakin/convert/op_converter.h" +#include "paddle/fluid/inference/anakin/engine.h" +#include "paddle/fluid/inference/analysis/helper.h" + +namespace paddle { +namespace operators { + +using FluidDT = framework::proto::VarType_Type; +using inference::Singleton; + +using anakin::graph::GraphGlobalMem; +using anakin::AK_FLOAT; +using anakin::Precision; +using anakin::saber::NV; +using anakin::saber::X86; +using anakin::saber::Shape; +using anakin::PBlock; +using anakin::PTuple; +using inference::anakin::AnakinEngine; + +class AnakinEngineOp : public framework::OperatorBase { + using AnakinNvEngineT = AnakinEngine; + + private: + std::vector input_names_; + std::unordered_set param_names_; + mutable AnakinNvEngineT *anakin_engine_; + std::string engine_key_; + std::string engine_serialized_data_; + + public: + AnakinEngineOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) { + input_names_ = Inputs("Xs"); + engine_key_ = Attr("engine_key"); + auto params = Attr>("parameters"); + for (const auto ¶m : params) { + param_names_.insert(param); + } + anakin_engine_ = nullptr; + } + + protected: + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + RunAnakin(scope, dev_place); + } + + void RunAnakin(const framework::Scope &scope, + const platform::Place &dev_place) const { + auto *engine = GetEngine(scope, dev_place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + auto stream = + reinterpret_cast(dev_ctx).stream(); + + PADDLE_ENFORCE(!input_names_.empty(), "should pass more than one inputs"); + + std::vector output_maps = + Attr>("output_name_mapping"); + + std::map inputs; + // Convert input tensor from fluid to engine. + for (const auto &x : Inputs("Xs")) { + if (param_names_.count(x)) continue; + auto &t = + inference::analysis::GetFromScope(scope, x); + + inputs.insert({x, &t}); + } + + std::map outputs; + int output_index = 0; + for (const auto &y : Outputs("Ys")) { + auto *fluid_v = scope.FindVar(y); + PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y); + auto *fluid_t = fluid_v->GetMutable(); + outputs.insert({output_maps[output_index], fluid_t}); + output_index += 1; + } + engine->Execute(inputs, outputs, stream); + } + + AnakinNvEngineT *GetEngine(const framework::Scope &scope, + const platform::Place &dev_place) const { + if (anakin_engine_ == nullptr) { + anakin_engine_ = + inference::Singleton::Global() + .Get(engine_key_); + } + + return anakin_engine_; + } + + void Prepare(const framework::Scope &scope, const platform::Place &dev_place, + AnakinNvEngineT *engine) const { + LOG(INFO) << "Prepare Anakin engine (Optimize model structure, Select OP " + "kernel etc). 
This process may cost a lot of time."; + framework::proto::BlockDesc block_desc; + block_desc.ParseFromString(Attr("subgraph")); + + std::vector output_maps = + Attr>("output_name_mapping"); + + inference::Singleton::Global() + .ConvertBlock(block_desc, param_names_, scope, engine); + engine->Freeze(); + for (const auto &x : Inputs("Xs")) { + if (param_names_.count(x)) continue; + auto &t = + inference::analysis::GetFromScope(scope, x); + auto t_shape = framework::vectorize2int(t.dims()); + // all input shape should be 4 dims + if (t_shape.size() == 2) { + t_shape.push_back(1); + t_shape.push_back(1); + } + engine->SetInputShape(x, t_shape); + } + + engine->Optimize(); + + engine->InitGraph(); + } +}; + +} // namespace operators +} // namespace paddle + +#endif // PADDLE_WITH_CUDA diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index feac4125381bd897dac89943af44850012e4761d..494d26f58f23ad1e445bbe8d7f8ce1037e5aa598 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/batch_norm_op.h" +#include #include +#include #include "paddle/fluid/framework/data_layout.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -22,147 +24,150 @@ limitations under the License. */ namespace paddle { namespace operators { -class BatchNormOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), ""); - PADDLE_ENFORCE(ctx->HasInput("Scale"), ""); - PADDLE_ENFORCE(ctx->HasInput("Bias"), ""); - PADDLE_ENFORCE(ctx->HasInput("Mean"), ""); - PADDLE_ENFORCE(ctx->HasInput("Variance"), ""); - PADDLE_ENFORCE(ctx->HasOutput("Y"), ""); - PADDLE_ENFORCE(ctx->HasOutput("MeanOut"), ""); - PADDLE_ENFORCE(ctx->HasOutput("VarianceOut"), ""); - PADDLE_ENFORCE(ctx->HasOutput("SavedMean"), ""); - PADDLE_ENFORCE(ctx->HasOutput("SavedVariance"), ""); - - // make sure Mean/MeanOut and Variance/VarianceOut share memory in Python - PADDLE_ENFORCE_EQ(ctx->Inputs("Mean")[0], ctx->Outputs("MeanOut")[0], - "Mean and MeanOut should share the same memory"); - PADDLE_ENFORCE_EQ(ctx->Inputs("Variance")[0], - ctx->Outputs("VarianceOut")[0], - "Variance and VarianceOut should share the same memory"); - - const auto x_dims = ctx->GetInputDim("X"); - const DataLayout data_layout = framework::StringToDataLayout( - ctx->Attrs().Get("data_layout")); - - PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, - "Input X must have 2 to 5 dimensions."); - - const int64_t C = - (data_layout == DataLayout::kNCHW ? 
x_dims[1] - : x_dims[x_dims.size() - 1]); - - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], C); - - ctx->SetOutputDim("Y", x_dims); - ctx->SetOutputDim("MeanOut", {C}); - ctx->SetOutputDim("VarianceOut", {C}); - ctx->SetOutputDim("SavedMean", {C}); - ctx->SetOutputDim("SavedVariance", {C}); - ctx->ShareLoD("X", "Y"); +void BatchNormOp::InferShape(framework::InferShapeContext *ctx) const { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of ConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Scale"), + "Input(Scale) of ConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Bias"), + "Input(Bias) of ConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Mean"), + "Input(Mean) of ConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Variance"), + "Input(Variance) of ConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Y"), + "Output(Y) of ConvOp should not be null."); + bool is_test = ctx->Attrs().Get("is_test"); + if (!is_test) { + PADDLE_ENFORCE(ctx->HasOutput("MeanOut"), + "Output(MeanOut) of ConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("VarianceOut"), + "Output(VarianceOut) of ConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("SavedMean"), + "Output(SavedMean) of ConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("SavedVariance"), + "Output(SavedVariance) of ConvOp should not be null."); } - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - auto input_data_type = ctx.Input("X")->type(); - // By default, the type of the scale, bias, mean, - // and var tensors should both be float. (For float or float16 input tensor) - // or double (For double input tensor). - auto bn_param_type = framework::proto::VarType::FP32; - if (input_data_type == framework::proto::VarType::FP64) { - bn_param_type = framework::proto::VarType::FP64; - } - PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input("Scale")->type(), - "Scale input should be of float type"); - PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input("Bias")->type(), - "Bias input should be of float type"); - PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input("Mean")->type(), - "Mean input should be of float type"); - PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input("Variance")->type(), - "Variance input should be of float type"); - - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready - framework::LibraryType library = framework::LibraryType::kPlain; - framework::DataLayout layout = framework::DataLayout::kAnyLayout; + // make sure Mean/MeanOut and Variance/VarianceOut share memory in Python + PADDLE_ENFORCE_EQ(ctx->Inputs("Mean")[0], ctx->Outputs("MeanOut")[0], + "Mean and MeanOut should share the same memory"); + PADDLE_ENFORCE_EQ(ctx->Inputs("Variance")[0], ctx->Outputs("VarianceOut")[0], + "Variance and VarianceOut should share the same memory"); + + const auto x_dims = ctx->GetInputDim("X"); + const DataLayout data_layout = framework::StringToDataLayout( + ctx->Attrs().Get("data_layout")); + + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "Input X must have 2 to 5 dimensions."); + + const int64_t C = + (data_layout == DataLayout::kNCHW ? 
x_dims[1] + : x_dims[x_dims.size() - 1]); + + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], C); + + ctx->SetOutputDim("Y", x_dims); + ctx->SetOutputDim("MeanOut", {C}); + ctx->SetOutputDim("VarianceOut", {C}); + ctx->SetOutputDim("SavedMean", {C}); + ctx->SetOutputDim("SavedVariance", {C}); + ctx->ShareLoD("X", "Y"); +} + +framework::OpKernelType BatchNormOp::GetExpectedKernelType( + const framework::ExecutionContext &ctx) const { + auto input_data_type = ctx.Input("X")->type(); + // By default, the type of the scale, bias, mean, + // and var tensors should both be float. (For float or float16 input tensor) + // or double (For double input tensor). + auto bn_param_type = framework::proto::VarType::FP32; + if (input_data_type == framework::proto::VarType::FP64) { + bn_param_type = framework::proto::VarType::FP64; + } + PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input("Scale")->type(), + "Scale input should be of float type"); + PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input("Bias")->type(), + "Bias input should be of float type"); + PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input("Mean")->type(), + "Mean input should be of float type"); + PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input("Variance")->type(), + "Variance input should be of float type"); + + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; #ifdef PADDLE_WITH_MKLDNN - if (library == framework::LibraryType::kPlain && - platform::CanMKLDNNBeUsed(ctx)) { - library = framework::LibraryType::kMKLDNN; - layout = framework::DataLayout::kMKLDNN; - } -#endif - - return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, - library); + if (library == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; } -}; +#endif -class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddAttr("is_test", - "(bool, default false) Set to true for inference only, false " - "for training. Some layers may run faster when this is true.") - .SetDefault(false); - AddAttr("momentum", "").SetDefault(0.9); - AddAttr("epsilon", "") - .SetDefault(1e-5) - .AddCustomChecker([](const float &epsilon) { - PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f, - "'epsilon' should be between 0.0 and 0.001."); - }); - AddAttr("data_layout", "").SetDefault("NCHW"); - AddInput("X", "The input tensor"); - AddInput("Scale", - "Scale is a 1-dimensional tensor of size C " - "that is applied to the output"); - AddInput("Bias", - "Bias is a 1-dimensional tensor of size C " - "that is applied to the output"); - AddInput("Mean", - "The global mean (for training) or " - "estimated mean (for testing)"); - AddInput("Variance", - "The global variance (for training) " - "or estimated Variance (for testing)"); - AddOutput("Y", "result after normalization"); - AddOutput("MeanOut", - "Share memory with Mean. " - "Store the global mean when training"); - AddOutput("VarianceOut", - "Share memory with Variance. 
" - "Store the global Variance when training"); - AddOutput("SavedMean", - "Mean of the current mini batch, " - "will apply to output when training") - .AsIntermediate(); - AddOutput("SavedVariance", - "Variance of the current mini batch, " - "will apply to output when training") - .AsIntermediate(); - AddAttr("use_mkldnn", - "(bool, default false) Only used in mkldnn kernel") - .SetDefault(false); - AddAttr("fuse_with_relu", - "(bool, default false) Only used in mkldnn kernel") - .SetDefault(false); - AddAttr("use_global_stats", - "(bool, default false) Whether to use global mean and " - "variance. In inference or test mode, set use_global_stats " - "to true or is_test true. the behavior is equivalent. " - "In train mode, when setting use_global_stats True, the " - "global mean and variance are also used during train time, " - "the BN acts as scaling and shiffting.") - .SetDefault(false); - AddComment(R"DOC( + return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, + library); +} + +void BatchNormOpMaker::Make() { + AddAttr("is_test", + "(bool, default false) Set to true for inference only, false " + "for training. Some layers may run faster when this is true.") + .SetDefault(false); + AddAttr("momentum", "").SetDefault(0.9); + AddAttr("epsilon", "") + .SetDefault(1e-5) + .AddCustomChecker([](const float &epsilon) { + PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f, + "'epsilon' should be between 0.0 and 0.001."); + }); + AddAttr("data_layout", "").SetDefault("NCHW"); + AddInput("X", "The input tensor"); + AddInput("Scale", + "Scale is a 1-dimensional tensor of size C " + "that is applied to the output"); + AddInput("Bias", + "Bias is a 1-dimensional tensor of size C " + "that is applied to the output"); + AddInput("Mean", + "The global mean (for training) or " + "estimated mean (for testing)"); + AddInput("Variance", + "The global variance (for training) " + "or estimated Variance (for testing)"); + AddOutput("Y", "result after normalization"); + AddOutput("MeanOut", + "Share memory with Mean. " + "Store the global mean when training"); + AddOutput("VarianceOut", + "Share memory with Variance. " + "Store the global Variance when training"); + AddOutput("SavedMean", + "Mean of the current mini batch, " + "will apply to output when training") + .AsIntermediate(); + AddOutput("SavedVariance", + "Variance of the current mini batch, " + "will apply to output when training") + .AsIntermediate(); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr("fuse_with_relu", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr("use_global_stats", + "(bool, default false) Whether to use global mean and " + "variance. In inference or test mode, set use_global_stats " + "to true or is_test true. the behavior is equivalent. " + "In train mode, when setting use_global_stats True, the " + "global mean and variance are also used during train time, " + "the BN acts as scaling and shiffting.") + .SetDefault(false); + AddComment(R"DOC( Batch Normalization. Batch Norm has been implemented as discussed in the paper: @@ -173,17 +178,7 @@ The required data format for this layer is one of the following: 2. 
NCHW `[batch, in_channels, in_height, in_width]` )DOC"); - } -}; - -class BatchNormOpInferVarType - : public framework::PassInDtypeAndVarTypeToOutput { - protected: - std::unordered_map GetInputOutputWithSameType() - const override { - return std::unordered_map{{"X", /*->*/ "Y"}}; - } -}; +} template class BatchNormKernel @@ -336,82 +331,75 @@ class BatchNormKernel } }; -class BatchNormGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - // check input - PADDLE_ENFORCE(ctx->HasInput("X")); - PADDLE_ENFORCE(ctx->HasInput("Scale"), "Input(scale) should not be null."); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), - "Input(Y@GRAD) should not be null."); - PADDLE_ENFORCE(ctx->HasInput("SavedMean"), - "Input(SavedMean) should not be null."); - PADDLE_ENFORCE(ctx->HasInput("SavedVariance"), - "Input(SavedVariance) should not be null"); - - // check output - PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), ""); - if (ctx->HasOutput(framework::GradVarName("Scale"))) { - PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")), - "Output(Scale@GRAD) and Output(Bias@GRAD) should not be " - "null at same time"); - } - const bool use_global_stats = ctx->Attrs().Get("use_global_stats"); - if (use_global_stats) { - PADDLE_ENFORCE(!ctx->Attrs().Get("use_mkldnn"), - "Using global stats during training is not supported " - "in gradient op kernel of batch_norm_mkldnn_op now."); - } +void BatchNormGradOp::InferShape(framework::InferShapeContext *ctx) const { + // check input + PADDLE_ENFORCE(ctx->HasInput("X")); + PADDLE_ENFORCE(ctx->HasInput("Scale"), "Input(scale) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), + "Input(Y@GRAD) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("SavedMean"), + "Input(SavedMean) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("SavedVariance"), + "Input(SavedVariance) should not be null"); + + // check output + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), ""); + if (ctx->HasOutput(framework::GradVarName("Scale"))) { + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")), + "Output(Scale@GRAD) and Output(Bias@GRAD) should not be " + "null at same time"); + } + const bool use_global_stats = ctx->Attrs().Get("use_global_stats"); + if (use_global_stats) { + PADDLE_ENFORCE(!ctx->Attrs().Get("use_mkldnn"), + "Using global stats during training is not supported " + "in gradient op kernel of batch_norm_mkldnn_op now."); + } - const auto x_dims = ctx->GetInputDim("X"); - const DataLayout data_layout = framework::StringToDataLayout( - ctx->Attrs().Get("data_layout")); - const int C = - (data_layout == DataLayout::kNCHW ? x_dims[1] - : x_dims[x_dims.size() - 1]); + const auto x_dims = ctx->GetInputDim("X"); + const DataLayout data_layout = framework::StringToDataLayout( + ctx->Attrs().Get("data_layout")); + const int C = (data_layout == DataLayout::kNCHW ? 
x_dims[1] + : x_dims[x_dims.size() - 1]); - ctx->SetOutputDim(framework::GradVarName("X"), x_dims); - if (ctx->HasOutput(framework::GradVarName("Scale"))) { - ctx->SetOutputDim(framework::GradVarName("Scale"), {C}); - ctx->SetOutputDim(framework::GradVarName("Bias"), {C}); - } + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + if (ctx->HasOutput(framework::GradVarName("Scale"))) { + ctx->SetOutputDim(framework::GradVarName("Scale"), {C}); + ctx->SetOutputDim(framework::GradVarName("Bias"), {C}); } +} - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - const auto *var = ctx.InputVar(framework::GradVarName("Y")); - if (var == nullptr) { - PADDLE_THROW("can't find Y@GRAD"); - } - const Tensor *t = nullptr; - if (var->IsType()) { - t = &var->Get(); - } else if (var->IsType()) { - t = &var->Get(); - } - if (t == nullptr) { - PADDLE_THROW("can't find Y@GRAD"); - } +framework::OpKernelType BatchNormGradOp::GetExpectedKernelType( + const framework::ExecutionContext &ctx) const { + const auto *var = ctx.InputVar(framework::GradVarName("Y")); + if (var == nullptr) { + PADDLE_THROW("can't find Y@GRAD"); + } + const Tensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); + } else if (var->IsType()) { + t = &var->Get(); + } + if (t == nullptr) { + PADDLE_THROW("can't find Y@GRAD"); + } - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready - framework::LibraryType library = framework::LibraryType::kPlain; - framework::DataLayout layout = framework::DataLayout::kAnyLayout; + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; #ifdef PADDLE_WITH_MKLDNN - if (library == framework::LibraryType::kPlain && - platform::CanMKLDNNBeUsed(ctx)) { - library = framework::LibraryType::kMKLDNN; - layout = framework::DataLayout::kMKLDNN; - } + if (library == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; + } #endif - return framework::OpKernelType(ctx.Input("X")->type(), - ctx.GetPlace(), layout, library); - } -}; + return framework::OpKernelType(ctx.Input("X")->type(), ctx.GetPlace(), + layout, library); +} template class BatchNormGradKernel @@ -572,46 +560,36 @@ class BatchNormGradKernel } }; -class BatchNormGradMaker : public framework::SingleGradOpDescMaker { - public: - using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; - - protected: - std::unique_ptr Apply() const override { - auto *op = new framework::OpDesc(); - op->SetType("batch_norm_grad"); - op->SetInput("X", Input("X")); - op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); - - op->SetInput("Scale", Input("Scale")); - op->SetInput("Bias", Input("Bias")); - op->SetInput("SavedMean", Output("SavedMean")); - op->SetInput("SavedVariance", Output("SavedVariance")); - - // used when setting use_global_stats True during training - if (boost::get(GetAttr("use_global_stats"))) { - op->SetInput("Mean", Output("MeanOut")); - op->SetInput("Variance", Output("VarianceOut")); - } +std::unique_ptr BatchNormGradMaker::Apply() const { + auto *op = new framework::OpDesc(); + op->SetType(GradOpType()); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); + + op->SetInput("Scale", Input("Scale")); + op->SetInput("Bias", Input("Bias")); + op->SetInput("SavedMean", 
Output("SavedMean")); + op->SetInput("SavedVariance", Output("SavedVariance")); + + // used when setting use_global_stats True during training + if (boost::get(GetAttr("use_global_stats"))) { + op->SetInput("Mean", Output("MeanOut")); + op->SetInput("Variance", Output("VarianceOut")); + } - op->SetAttrMap(Attrs()); + op->SetAttrMap(Attrs()); - op->SetOutput(framework::GradVarName("X"), InputGrad("X")); - op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale")); - op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale")); + op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias")); - return std::unique_ptr(op); - } -}; + return std::unique_ptr(op); +} -class BatchNormInplaceInToOut : public framework::InplaceInToOut { +class BatchNormInplaceInToOut : public framework::InplaceOpInference { public: - using InplaceInToOut::InplaceInToOut; - - protected: - std::unordered_map Apply( - const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { + std::unordered_map operator()( + const framework::OpDesc &op_desc) const override { std::unordered_map inplace_in_to_out = { {"Mean", "MeanOut"}, {"Variance", "VarianceOut"}, {"X", "Y"}, }; @@ -619,14 +597,10 @@ class BatchNormInplaceInToOut : public framework::InplaceInToOut { } }; -class BatchNormGradInplaceInToOut : public framework::InplaceInToOut { +class BatchNormGradInplaceInToOut : public framework::InplaceOpInference { public: - using InplaceInToOut::InplaceInToOut; - - protected: - std::unordered_map Apply( - const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { + std::unordered_map operator()( + const framework::OpDesc &op_desc) const override { std::unordered_map inplace_in_to_out = { // Scale, Bias, SavedMean, SavedVariance shape is [batch_size, C] {framework::GradVarName("Y"), framework::GradVarName("X")}, @@ -642,10 +616,10 @@ class BatchNormGradInplaceInToOut : public framework::InplaceInToOut { namespace ops = paddle::operators; REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, - ops::BatchNormOpInferVarType, ops::BatchNormGradMaker, - ops::BatchNormInplaceInToOut); -REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp, - ops::BatchNormGradInplaceInToOut); + ops::BatchNormOpInferVarType, ops::BatchNormGradMaker) +// ops::BatchNormInplaceInToOut); +REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp) +// ops::BatchNormGradInplaceInToOut); REGISTER_OP_CPU_KERNEL( batch_norm, ops::BatchNormKernel, diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu index 1c45746a92ad057a97d9f65aa256df616fc37f3d..36d297ec5523b9e8a136c536165bdb4d3a380c25 100644 --- a/paddle/fluid/operators/batch_norm_op.cu +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -33,26 +33,6 @@ using CudnnDataType = platform::CudnnDataType; template using BatchNormParamType = typename CudnnDataType::BatchNormParamType; -void ExtractNCWHD(const framework::DDim &dims, const DataLayout &data_layout, - int *N, int *C, int *H, int *W, int *D) { - *N = dims[0]; - if (dims.size() == 2) { - *C = dims[1]; - *H = 1; - *W = 1; - *D = 1; - } else { - *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1]; - *H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1]; - *W = dims.size() > 3 - ? (data_layout == DataLayout::kNCHW ? dims[3] : dims[2]) - : 1; - *D = dims.size() > 4 - ? 
(data_layout == DataLayout::kNCHW ? dims[4] : dims[3]) - : 1; - } -} - template class BatchNormKernel : public framework::OpKernel { @@ -196,22 +176,6 @@ class BatchNormKernel } }; -template -static __global__ void KeBNBackwardData(const T *dy, - const BatchNormParamType *scale, - const BatchNormParamType *variance, - const double epsilon, const int C, - const int HxW, const int num, T *dx) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - for (int i = gid; i < num; i += stride) { - const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C; - BatchNormParamType inv_var = 1.0 / sqrt(variance[c] + epsilon); - dx[i] = static_cast(static_cast>(dy[i]) * - scale[c] * inv_var); - } -} - template static __global__ void KeBNBackwardScaleBias( const T *dy, const T *x, const BatchNormParamType *mean, @@ -248,6 +212,22 @@ static __global__ void KeBNBackwardScaleBias( } } +template +static __global__ void KeBNBackwardData(const T *dy, + const BatchNormParamType *scale, + const BatchNormParamType *variance, + const double epsilon, const int C, + const int HxW, const int num, T *dx) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (int i = gid; i < num; i += stride) { + const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C; + BatchNormParamType inv_var = 1.0 / sqrt(variance[c] + epsilon); + dx[i] = static_cast(static_cast>(dy[i]) * + scale[c] * inv_var); + } +} + template class BatchNormGradKernel : public framework::OpKernel { @@ -383,7 +363,7 @@ class BatchNormGradKernel KeBNBackwardScaleBias<<< grid2, block, 0, dev_ctx.stream()>>>( d_y->data(), x->data(), running_mean_data, running_var_data, - epsilon, C, H * W, num, d_scale->data>(), + epsilon, N, C, H * W * D, d_scale->data>(), d_bias->data>()); } } else { @@ -394,10 +374,10 @@ class BatchNormGradKernel running_var_data, epsilon, C, H * W, num, d_x->data()); } if (d_scale && d_bias) { - KeBNBackwardScaleBias<<< + KeBNBackwardScaleBias<<< grid2, block, 0, dev_ctx.stream()>>>( d_y->data(), x->data(), running_mean_data, running_var_data, - epsilon, C, H * W, num, d_scale->data>(), + epsilon, N, C, H * W * D, d_scale->data>(), d_bias->data>()); } } diff --git a/paddle/fluid/operators/batch_norm_op.h b/paddle/fluid/operators/batch_norm_op.h index 5e3d630d6889e445c5e84fa836d2d81bb7266779..6e89d73eb236ee7844c7de3c273e0b0f275a3e33 100644 --- a/paddle/fluid/operators/batch_norm_op.h +++ b/paddle/fluid/operators/batch_norm_op.h @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include +#include +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" @@ -35,17 +38,84 @@ template using ConstEigenVectorArrayMap = Eigen::Map>; +class BatchNormOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext *ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override; +}; + +class BatchNormGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext *ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override; +}; + +class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override; +}; + +class BatchNormGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override; + + virtual std::string GradOpType() const { + return this->ForwardOpType() + "_grad"; + } +}; + +class BatchNormOpInferVarType + : public framework::PassInDtypeAndVarTypeToOutput { + protected: + std::unordered_map GetInputOutputWithSameType() + const override { + return std::unordered_map{{"X", /*->*/ "Y"}}; + } +}; + template class BatchNormKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override; + void Compute(const framework::ExecutionContext &ctx) const override; }; template class BatchNormGradKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override; + void Compute(const framework::ExecutionContext &ctx) const override; }; +inline void ExtractNCWHD(const framework::DDim &dims, + const DataLayout &data_layout, int *N, int *C, int *H, + int *W, int *D) { + *N = dims[0]; + if (dims.size() == 2) { + *C = dims[1]; + *H = 1; + *W = 1; + *D = 1; + } else { + *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1]; + *H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1]; + *W = dims.size() > 3 + ? (data_layout == DataLayout::kNCHW ? dims[3] : dims[2]) + : 1; + *D = dims.size() > 4 + ? (data_layout == DataLayout::kNCHW ? dims[4] : dims[3]) + : 1; + } +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc index cf78c83297a87beb08a8b8e6e4b182f03f1909d3..4cef49280dfb5207a9d94df42d94657f03ec838f 100644 --- a/paddle/fluid/operators/beam_search_decode_op.cc +++ b/paddle/fluid/operators/beam_search_decode_op.cc @@ -178,10 +178,10 @@ Beam Search Decode Operator. This Operator constructs the full hypotheses for each source sentence by walking back along the LoDTensorArray Input(ids) whose lods can be used to restore the path in the beam search tree. -The Output(SentenceIds) and Output(SentenceScores) separately contain the -generated id sequences and the corresponding scores. The shapes and lods of the -two LodTensor are same. The lod level is 2 and the two levels separately -indicate how many hypotheses each source sentence has and how many ids each +The Output(SentenceIds) and Output(SentenceScores) separately contain the +generated id sequences and the corresponding scores. 
The shapes and lods of the +two LodTensor are same. The lod level is 2 and the two levels separately +indicate how many hypotheses each source sentence has and how many ids each hypothesis has. )DOC"); } @@ -203,15 +203,12 @@ class BeamSearchDecodeInferShape : public framework::InferShapeBase { class BeamSearchDecodeInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc& op_desc, - framework::BlockDesc* block) const override { - for (auto& o : op_desc.Output("SentenceIds")) { - auto& sentence_ids = block->FindRecursiveOrCreateVar(o); - sentence_ids.SetType(framework::proto::VarType::LOD_TENSOR); + void operator()(framework::InferVarTypeContext* ctx) const override { + for (auto& o : ctx->Output("SentenceIds")) { + ctx->SetType(o, framework::proto::VarType::LOD_TENSOR); } - for (auto& o : op_desc.Output("SentenceScores")) { - auto& sentence_scores = block->FindRecursiveOrCreateVar(o); - sentence_scores.SetType(framework::proto::VarType::LOD_TENSOR); + for (auto& o : ctx->Output("SentenceScores")) { + ctx->SetType(o, framework::proto::VarType::LOD_TENSOR); } } }; diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc index e93cd8615e052e4dfc6255549bf7a9b84b7dd657..a6aa35e0569364d79c15aea6e6dbc6ca670d49f0 100644 --- a/paddle/fluid/operators/beam_search_op.cc +++ b/paddle/fluid/operators/beam_search_op.cc @@ -51,9 +51,9 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("selected_scores", "A LoDTensor containing the accumulated scores corresponding to " "Output(selected_ids)."); - AddOutput( - "parent_idx", - "A Tensor preserving the selected_ids' parent indice in pre_ids."); + AddOutput("parent_idx", + "A Tensor preserving the selected_ids' parent indice in pre_ids.") + .AsDispensable(); // Attributes stored in AttributeMap AddAttr("level", "the level of LoDTensor"); @@ -65,7 +65,7 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(true); AddComment(R"DOC( -This operator does the search in beams for one time step. +This operator does the search in beams for one time step. 
Specifically, it selects the top-K candidate word ids of current step from Input(ids) according to their Input(scores) for all source sentences, where K is Attr(beam_size) and Input(ids), Input(scores) are predicted results @@ -120,15 +120,12 @@ class BeamSearchOp : public framework::OperatorWithKernel { class BeamSearchInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { - for (auto &o : op_desc.Output("selected_ids")) { - auto &selected_ids = block->FindRecursiveOrCreateVar(o); - selected_ids.SetType(framework::proto::VarType::LOD_TENSOR); + void operator()(framework::InferVarTypeContext *ctx) const override { + for (auto &o : ctx->Output("selected_ids")) { + ctx->SetType(o, framework::proto::VarType::LOD_TENSOR); } - for (auto &o : op_desc.Output("selected_scores")) { - auto &selected_scores = block->FindRecursiveOrCreateVar(o); - selected_scores.SetType(framework::proto::VarType::LOD_TENSOR); + for (auto &o : ctx->Output("selected_scores")) { + ctx->SetType(o, framework::proto::VarType::LOD_TENSOR); } } }; diff --git a/paddle/fluid/operators/beam_search_op.h b/paddle/fluid/operators/beam_search_op.h index f808020cc765585d1633c6c3bf528080a7e83f07..3d32ea0cc9686a709b185087d76d12f266663d03 100644 --- a/paddle/fluid/operators/beam_search_op.h +++ b/paddle/fluid/operators/beam_search_op.h @@ -44,7 +44,6 @@ class BeamSearchOpKernel : public framework::OpKernel { auto* parent_idx = context.Output("parent_idx"); PADDLE_ENFORCE_NOT_NULL(selected_ids); PADDLE_ENFORCE_NOT_NULL(selected_scores); - PADDLE_ENFORCE_NOT_NULL(parent_idx); math::BeamSearchFunctor alg; alg(context.template device_context(), pre_ids, pre_scores, diff --git a/paddle/fluid/operators/benchmark/op_tester.cc b/paddle/fluid/operators/benchmark/op_tester.cc index 064903c299d947df3c6b42d916fce8dcbd85eebb..fec091255f6391b77cd2858905f3aa2e5dd8baff 100644 --- a/paddle/fluid/operators/benchmark/op_tester.cc +++ b/paddle/fluid/operators/benchmark/op_tester.cc @@ -42,8 +42,8 @@ void OpTester::Init(const OpTesterConfig &config) { // Initialize the OpDesc if (op_desc_info.Has(config_.op_type)) { type_ = config_.op_type; - op_desc_.SetType(config_.op_type); + CreateOpDesc(); CreateInputVarDesc(); CreateOutputVarDesc(); } else { @@ -131,6 +131,40 @@ std::vector OpTester::GetOpProtoOutputNames() { return output_names; } +std::unordered_map +OpTester::GetOpProtoAttrNames() { + std::unordered_map attr_types; + const framework::proto::OpProto &proto = + framework::OpInfoMap::Instance().Get(type_).Proto(); + const std::vector skipped_attrs = { + framework::OpProtoAndCheckerMaker::OpRoleAttrName(), + framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(), + framework::OpProtoAndCheckerMaker::OpNamescopeAttrName(), + framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName()}; + for (int i = 0; i != proto.attrs_size(); ++i) { + const auto &attr = proto.attrs(i); + if (!Has(skipped_attrs, attr.name())) { + VLOG(4) << "attr: " << attr.name() << ", type: " << attr.type(); + attr_types[attr.name()] = attr.type(); + } + } + return attr_types; +} + +framework::proto::VarType::Type OpTester::TransToVarType(std::string str) { + if (str == "int32") { + return framework::proto::VarType::INT32; + } else if (str == "int64") { + return framework::proto::VarType::INT64; + } else if (str == "fp32") { + return framework::proto::VarType::FP32; + } else if (str == "fp64") { + return framework::proto::VarType::FP64; + } else { + PADDLE_THROW("Unsupported 
dtype %s.", str.c_str()); + } +} + void OpTester::CreateInputVarDesc() { std::vector input_names = GetOpProtoInputNames(); for (auto &name : input_names) { @@ -145,11 +179,11 @@ void OpTester::CreateInputVarDesc() { // Need to support more type var->SetType(framework::proto::VarType::LOD_TENSOR); var->SetPersistable(false); - var->SetDataType(framework::proto::VarType::FP32); + var->SetDataType(TransToVarType(input->dtype)); var->SetShape(input->dims); op_desc_.SetInput(name, {var_name}); - input_lods_[var_name] = input->lod; + inputs_[var_name] = *input; } } @@ -167,6 +201,49 @@ void OpTester::CreateOutputVarDesc() { } } +void OpTester::CreateOpDesc() { + op_desc_.SetType(config_.op_type); + std::unordered_map attr_types = + GetOpProtoAttrNames(); + for (auto item : config_.attrs) { + const std::string &name = item.first; + if (attr_types.find(name) == attr_types.end()) { + LOG(FATAL) << "Operator " << type_ << " do not have attr " << name; + } + + const std::string &value_str = item.second; + const framework::proto::AttrType &type = attr_types[name]; + switch (type) { + case framework::proto::AttrType::BOOLEAN: + break; + case framework::proto::AttrType::INT: { + int value = StringTo(value_str); + op_desc_.SetAttr(name, {value}); + } break; + case framework::proto::AttrType::FLOAT: { + float value = StringTo(value_str); + op_desc_.SetAttr(name, {value}); + } break; + case framework::proto::AttrType::STRING: { + op_desc_.SetAttr(name, {value_str}); + } break; + case framework::proto::AttrType::BOOLEANS: + case framework::proto::AttrType::INTS: + case framework::proto::AttrType::FLOATS: + case framework::proto::AttrType::STRINGS: + LOG(FATAL) << "Not supported yet."; + break; + case framework::proto::AttrType::LONG: { + int64_t value = StringTo(value_str); + op_desc_.SetAttr(name, value); + } break; + case framework::proto::AttrType::LONGS: + default: + PADDLE_THROW("Unsupport attr type %d", type); + } + } +} + framework::VarDesc *OpTester::Var(const std::string &name) { auto it = vars_.find(name); if (it != vars_.end()) { @@ -179,24 +256,41 @@ framework::VarDesc *OpTester::Var(const std::string &name) { template void OpTester::SetupTensor(framework::LoDTensor *tensor, - const std::vector &shape, T lower, - T upper) { + const std::vector &shape, T lower, T upper, + const std::string &initializer) { static unsigned int seed = 100; std::mt19937 rng(seed++); std::uniform_real_distribution uniform_dist(0, 1); T *ptr = tensor->mutable_data(framework::make_ddim(shape), place_); - if (platform::is_cpu_place(place_)) { - for (int i = 0; i < tensor->numel(); ++i) { - ptr[i] = static_cast(uniform_dist(rng) * (upper - lower) + lower); - } + + framework::LoDTensor cpu_tensor; + T *cpu_ptr = nullptr; + + if (!platform::is_cpu_place(place_)) { + cpu_ptr = cpu_tensor.mutable_data(framework::make_ddim(shape), + platform::CPUPlace()); } else { - framework::LoDTensor cpu_tensor; - T *cpu_ptr = cpu_tensor.mutable_data(framework::make_ddim(shape), - platform::CPUPlace()); + cpu_ptr = ptr; + } + + if (initializer == "random") { for (int i = 0; i < cpu_tensor.numel(); ++i) { cpu_ptr[i] = static_cast(uniform_dist(rng) * (upper - lower) + lower); } + } else if (initializer == "natural") { + for (int i = 0; i < cpu_tensor.numel(); ++i) { + cpu_ptr[i] = lower + i; + } + } else if (initializer == "zeros") { + for (int i = 0; i < cpu_tensor.numel(); ++i) { + cpu_ptr[i] = 0; + } + } else { + PADDLE_THROW("Unsupported initializer %s.", initializer.c_str()); + } + + if (!platform::is_cpu_place(place_)) { 
TensorCopySync(cpu_tensor, place_, tensor); } } @@ -219,7 +313,7 @@ void OpTester::CreateVariables(framework::Scope *scope) { } } - for (auto &item : input_lods_) { + for (auto &item : inputs_) { // Allocate memory for input tensor auto &var_name = item.first; VLOG(3) << "Allocate memory for tensor " << var_name; @@ -229,11 +323,23 @@ void OpTester::CreateVariables(framework::Scope *scope) { auto *var = scope->Var(var_name); auto *tensor = var->GetMutable(); - SetupTensor(tensor, shape, static_cast(0.0), - static_cast(1.0)); + const auto &data_type = var_desc->GetDataType(); + if (data_type == framework::proto::VarType::INT32) { + SetupTensor(tensor, shape, 0, 1, item.second.initializer); + } else if (data_type == framework::proto::VarType::INT64) { + SetupTensor(tensor, shape, 0, 1, item.second.initializer); + } else if (data_type == framework::proto::VarType::FP32) { + SetupTensor(tensor, shape, static_cast(0.0), + static_cast(1.0), item.second.initializer); + } else if (data_type == framework::proto::VarType::FP64) { + SetupTensor(tensor, shape, static_cast(0.0), + static_cast(1.0), item.second.initializer); + } else { + PADDLE_THROW("Unsupported dtype %d.", data_type); + } VLOG(3) << "Set lod for tensor " << var_name; - std::vector> &lod_vec = item.second; + std::vector> &lod_vec = item.second.lod; framework::LoD lod; for (size_t i = 0; i < lod_vec.size(); ++i) { lod.push_back(lod_vec[i]); @@ -261,7 +367,16 @@ std::string OpTester::DebugString() { ss << GenSpaces(count) << "type: LOD_TENSOR\n"; ss << GenSpaces(count++) << "lod_tensor {\n"; ss << GenSpaces(count++) << "tensor {\n"; - ss << GenSpaces(count) << "data_type: FP32\n"; + const auto &data_type = var->GetDataType(); + if (data_type == framework::proto::VarType::INT32) { + ss << GenSpaces(count) << "data_type: INT32\n"; + } else if (data_type == framework::proto::VarType::INT64) { + ss << GenSpaces(count) << "data_type: INT64\n"; + } else if (data_type == framework::proto::VarType::FP32) { + ss << GenSpaces(count) << "data_type: FP32\n"; + } else if (data_type == framework::proto::VarType::FP64) { + ss << GenSpaces(count) << "data_type: FP64\n"; + } std::vector shape = var->GetShape(); for (auto d : shape) { ss << GenSpaces(count) << "dims: " << d << "\n"; @@ -288,6 +403,63 @@ std::string OpTester::DebugString() { ss << GenSpaces(--count) << "}\n"; } ss << GenSpaces(count) << "type: " << op_desc_.Type() << "\n"; + for (auto &name : op_desc_.AttrNames()) { + ss << GenSpaces(count++) << "attrs {\n"; + const auto &attr_type = op_desc_.GetAttrType(name); + const auto &attr = op_desc_.GetAttr(name); + ss << GenSpaces(count) << "name: \"" << name << "\"\n"; + switch (attr_type) { + case framework::proto::AttrType::BOOLEAN: { + ss << GenSpaces(count) << "type: BOOLEAN\n"; + ss << GenSpaces(count) << "b: " << boost::get(attr) << "\n"; + } break; + case framework::proto::AttrType::INT: { + ss << GenSpaces(count) << "type: INT\n"; + ss << GenSpaces(count) << "i: " << boost::get(attr) << "\n"; + } break; + case framework::proto::AttrType::FLOAT: { + ss << GenSpaces(count) << "type: FLOAT\n"; + ss << GenSpaces(count) << "f: " << boost::get(attr) << "\n"; + } break; + case framework::proto::AttrType::STRING: { + ss << GenSpaces(count) << "type: STRING\n"; + ss << GenSpaces(count) << "s: \"" << boost::get(attr) + << "\"\n"; + } break; + case framework::proto::AttrType::BOOLEANS: { + ss << GenSpaces(count) << "type: BOOLEANS\n"; + ss << GenSpaces(count) << "bools: " + << "\n"; + } break; + case framework::proto::AttrType::INTS: { + ss << 
GenSpaces(count) << "type: INTS\n"; + ss << GenSpaces(count) << "ints: " + << "\n"; + } break; + case framework::proto::AttrType::FLOATS: { + ss << GenSpaces(count) << "type: FLOATS\n"; + ss << GenSpaces(count) << "floats: " + << "\n"; + } break; + case framework::proto::AttrType::STRINGS: { + ss << GenSpaces(count) << "type: STRINGS\n"; + ss << GenSpaces(count) << "strings: " + << "\n"; + } break; + case framework::proto::AttrType::LONG: { + ss << GenSpaces(count) << "type: LONG\n"; + ss << GenSpaces(count) << "l: " << boost::get(attr) << "\n"; + } break; + case framework::proto::AttrType::LONGS: { + ss << GenSpaces(count) << "type: LONGS\n"; + ss << GenSpaces(count) << "longs: " + << "\n"; + } break; + default: + PADDLE_THROW("Unsupport attr type %d", attr_type); + } + ss << GenSpaces(--count) << "}\n"; + } ss << GenSpaces(--count) << "}\n"; return ss.str(); } @@ -299,6 +471,7 @@ TEST(op_tester, base) { FLAGS_op_config_list.c_str()); std::vector op_configs; while (!fin.eof()) { + VLOG(4) << "Reading config " << op_configs.size() << "..."; OpTesterConfig config; bool result = config.Init(fin); if (result) { diff --git a/paddle/fluid/operators/benchmark/op_tester.h b/paddle/fluid/operators/benchmark/op_tester.h index 8f150b23ad783acdfd203d471d578ab6aae71494..328389293c4b71a2f1fefbc3bf26fd46b79ec6e2 100644 --- a/paddle/fluid/operators/benchmark/op_tester.h +++ b/paddle/fluid/operators/benchmark/op_tester.h @@ -14,7 +14,9 @@ limitations under the License. */ #pragma once +#include #include +#include #include #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/op_desc.h" @@ -39,16 +41,21 @@ class OpTester { private: std::vector GetOpProtoInputNames(); std::vector GetOpProtoOutputNames(); + std::unordered_map + GetOpProtoAttrNames(); + framework::proto::VarType::Type TransToVarType(std::string str); void CreateInputVarDesc(); void CreateOutputVarDesc(); + void CreateOpDesc(); framework::VarDesc *Var(const std::string &name); void CreateVariables(framework::Scope *scope); template void SetupTensor(framework::LoDTensor *input, - const std::vector &shape, T lower, T upper); + const std::vector &shape, T lower, T upper, + const std::string &initializer); void RunImpl(); @@ -57,7 +64,7 @@ class OpTester { std::string type_; framework::OpDesc op_desc_; std::unordered_map> vars_; - std::unordered_map>> input_lods_; + std::unordered_map inputs_; std::unique_ptr op_; platform::Place place_; std::unique_ptr scope_; diff --git a/paddle/fluid/operators/benchmark/op_tester_config.cc b/paddle/fluid/operators/benchmark/op_tester_config.cc index 8336804ec07d2b7b176f55ad4113452086296494..b4878ab04244cf6b54d323943fc1fbf4e3882660 100644 --- a/paddle/fluid/operators/benchmark/op_tester_config.cc +++ b/paddle/fluid/operators/benchmark/op_tester_config.cc @@ -14,7 +14,6 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/benchmark/op_tester_config.h" #include -#include "glog/logging.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -40,6 +39,62 @@ static void EraseEndSep(std::string* str, } } +OpInputConfig::OpInputConfig(std::istream& is) { + std::string sep; + is >> sep; + if (sep == kStartSeparator) { + while (sep != kEndSeparator) { + is >> sep; + if (sep == "name" || sep == "name:") { + is >> name; + EraseEndSep(&name); + } else if (sep == "dtype" || sep == "dtype:") { + ParseDType(is); + } else if (sep == "initializer" || sep == "initializer:") { + ParseInitializer(is); + } else if (sep == "dims" || sep == "dims:") { + ParseDims(is); + } else if (sep == "lod" || sep == "lod:") { + ParseLoD(is); + } + } + } +} + +void OpInputConfig::ParseDType(std::istream& is) { + std::string dtype_str; + is >> dtype_str; + EraseEndSep(&dtype_str); + + if (dtype_str == "int32" || dtype_str == "int") { + dtype = "int32"; + } else if (dtype_str == "int64" || dtype_str == "long") { + dtype = "int64"; + } else if (dtype_str == "fp32" || dtype_str == "float") { + dtype = "fp32"; + } else if (dtype_str == "fp64" || dtype_str == "double") { + dtype = "fp64"; + } else { + PADDLE_THROW("Unsupported dtype %s", dtype_str.c_str()); + } + VLOG(4) << "dtype of input " << name << " is: " << dtype; +} + +void OpInputConfig::ParseInitializer(std::istream& is) { + std::string initializer_str; + is >> initializer_str; + EraseEndSep(&initializer_str); + + const std::vector supported_initializers = {"random", "natural", + "zeros"}; + if (!Has(supported_initializers, initializer_str)) { + PADDLE_THROW("Unsupported initializer %s", initializer_str.c_str()); + } + + initializer = initializer_str; + VLOG(4) << "initializer of input " << name << " is: " << initializer; +} + void OpInputConfig::ParseDims(std::istream& is) { std::string dims_str; is >> dims_str; @@ -84,7 +139,7 @@ void OpInputConfig::ParseLoD(std::istream& is) { number += lod_str[i]; ++i; } - level.push_back(atoi(number.c_str())); + level.push_back(StringTo(number)); } lod.push_back(level); } else if (lod_str[i] == '}') { @@ -93,24 +148,6 @@ void OpInputConfig::ParseLoD(std::istream& is) { } } -OpInputConfig::OpInputConfig(std::istream& is) { - std::string sep; - is >> sep; - if (sep == kStartSeparator) { - while (sep != kEndSeparator) { - is >> sep; - if (sep == "name" || sep == "name:") { - is >> name; - EraseEndSep(&name); - } else if (sep == "dims" || sep == "dims:") { - ParseDims(is); - } else if (sep == "lod" || sep == "lod:") { - ParseLoD(is); - } - } - } -} - OpTesterConfig::OpTesterConfig(const std::string& filename) { std::ifstream fin(filename, std::ios::in | std::ios::binary); PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", @@ -167,6 +204,7 @@ bool OpTesterConfig::ParseAttrs(std::istream& is) { is >> value; EraseEndSep(&key, ":"); EraseEndSep(&value); + VLOG(4) << "attrs: " << key << ", " << value; attrs[key] = value; } diff --git a/paddle/fluid/operators/benchmark/op_tester_config.h b/paddle/fluid/operators/benchmark/op_tester_config.h index c2ff6dafc053eb7202a686954d53ae6f3d62d02e..5803f82ac28867a481875c2af607290c5d366146 100644 --- a/paddle/fluid/operators/benchmark/op_tester_config.h +++ b/paddle/fluid/operators/benchmark/op_tester_config.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #pragma once #include +#include #include #include #include @@ -27,10 +28,14 @@ struct OpInputConfig { OpInputConfig() {} explicit OpInputConfig(std::istream& is); + void ParseDType(std::istream& is); + void ParseInitializer(std::istream& is); void ParseDims(std::istream& is); void ParseLoD(std::istream& is); std::string name; + std::string dtype{"fp32"}; // int32/int, int64/long, fp32/float, fp64/double + std::string initializer{"random"}; // random, natural, zeros std::vector dims; std::vector> lod; }; @@ -55,6 +60,23 @@ struct OpTesterConfig { double runtime{0.0}; }; +static bool Has(const std::vector& vec, const std::string& item) { + for (size_t i = 0; i < vec.size(); ++i) { + if (vec[i] == item) { + return true; + } + } + return false; +} + +template +T StringTo(const std::string& str) { + std::istringstream is(str); + T value; + is >> value; + return value; +} + } // namespace benchmark } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc index 8d6a498dc941e44688ec8a2b49a6e080608f9b85..0c517cc757ca3f6f1ff7f4191ab2d529890b7154 100644 --- a/paddle/fluid/operators/cast_op.cc +++ b/paddle/fluid/operators/cast_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/cast_op.h" +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/float16.h" @@ -30,7 +31,8 @@ class CastOpProtoMaker : public framework::OpProtoAndCheckerMaker { Cast Operator. This Operator casts the input tensor to another data type and -returns tha Output Tensor. +returns the Output Tensor. Casting to the same dtype as the input is a +no-op, but it is allowed. )DOC"); } diff --git a/paddle/fluid/operators/clip_by_norm_op.cc b/paddle/fluid/operators/clip_by_norm_op.cc index eae86a373be278cbb3ea9425b2ff0169f8faa99e..5720b295ecf8171540803aaadff43dfdcb20553b 100644 --- a/paddle/fluid/operators/clip_by_norm_op.cc +++ b/paddle/fluid/operators/clip_by_norm_op.cc @@ -14,69 +14,10 @@ limitations under the License. */ #include "paddle/fluid/operators/clip_by_norm_op.h" -namespace paddle { -namespace operators { - -class ClipByNormOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of ClipByNormOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of ClipByNormOp should not be null."); - auto max_norm = ctx->Attrs().Get("max_norm"); - PADDLE_ENFORCE_GT(max_norm, 0, "max_norm should be greater than 0."); - auto x_dims = ctx->GetInputDim("X"); - ctx->SetOutputDim("Out", x_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } -}; - -class ClipByNormOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "(Tensor) The input of clip_by_norm op." - "The number of dimensions must be between [1, 9]."); - AddOutput("Out", - "(Tensor) The output of clip_by_norm op with shape as input(X)"); - AddAttr("max_norm", "(float) The maximum norm value."); - AddComment(R"DOC( -ClipByNorm Operator. - -This operator limits the L2 norm of the input $X$ within $max\_norm$. -If the L2 norm of $X$ is less than or equal to $max\_norm$, $Out$ will be -the same as $X$.
If the L2 norm of $X$ is greater than $max\_norm$, $X$ will -be linearly scaled to make the L2 norm of $Out$ equal to $max\_norm$, as -shown in the following formula: - -$$ -Out = \\frac{max\\_norm * X}{norm(X)}, -$$ - -where $norm(X)$ represents the L2 norm of $X$. - -Examples: - .. code-block:: python - - data = fluid.layer.data( - name='data', shape=[2, 4, 6], dtype='float32') - reshaped = fluid.layers.clip_by_norm( - x=data, max_norm=0.5) - -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(clip_by_norm, ops::ClipByNormOp, ops::ClipByNormOpMaker); + REGISTER_OP_CPU_KERNEL( clip_by_norm, ops::ClipByNormKernel); diff --git a/paddle/fluid/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h index 49e734ce96b0d38b59102575250a020e6924362a..d8baa4b8b235fdea7a3dc51ac7db1c004d49334a 100644 --- a/paddle/fluid/operators/clip_by_norm_op.h +++ b/paddle/fluid/operators/clip_by_norm_op.h @@ -83,5 +83,59 @@ class ClipByNormKernel : public framework::OpKernel { } }; +class ClipByNormOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ClipByNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ClipByNormOp should not be null."); + auto max_norm = ctx->Attrs().Get("max_norm"); + PADDLE_ENFORCE_GT(max_norm, 0, "max_norm should be greater than 0."); + auto x_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class ClipByNormOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor) The input of clip_by_norm op." + "The number of dimensions must be between [1, 9]."); + AddOutput("Out", + "(Tensor) The output of clip_by_norm op with shape as input(X)"); + AddAttr("max_norm", "(float) The maximum norm value."); + AddComment(R"DOC( +ClipByNorm Operator. + +This operator limits the L2 norm of the input $X$ within $max\_norm$. +If the L2 norm of $X$ is less than or equal to $max\_norm$, $Out$ will be +the same as $X$. If the L2 norm of $X$ is greater than $max\_norm$, $X$ will +be linearly scaled to make the L2 norm of $Out$ equal to $max\_norm$, as +shown in the following formula: + +$$ +Out = \\frac{max\\_norm * X}{norm(X)}, +$$ + +where $norm(X)$ represents the L2 norm of $X$. + +Examples: + .. code-block:: python + + data = fluid.layers.data( + name='data', shape=[2, 4, 6], dtype='float32') + clipped = fluid.layers.clip_by_norm( + x=data, max_norm=0.5) + +)DOC"); + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/clip_op.cc b/paddle/fluid/operators/clip_op.cc index a679f7e2536a0a44148193f423f5ffe11b5e35fc..4fc6ae365ec61326670775ab13b854235f19266f 100644 --- a/paddle/fluid/operators/clip_op.cc +++ b/paddle/fluid/operators/clip_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License.
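(Editorial aside: a minimal scalar sketch of the clipping rule above, for concreteness; the real kernel works on Eigen tensors:

    #include <cmath>
    #include <vector>

    // Scale x so that its L2 norm is at most max_norm. For x = {1.2, 1.6}
    // (norm 2.0) and max_norm = 0.5, every element is scaled by 0.25,
    // giving {0.3, 0.4} with norm exactly 0.5.
    void ClipByNormRef(std::vector<float> *x, float max_norm) {
      float sq = 0.f;
      for (float v : *x) sq += v * v;
      const float norm = std::sqrt(sq);
      if (norm > max_norm) {
        const float scale = max_norm / norm;
        for (float &v : *x) v *= scale;
      }
    }
)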
*/ #include "paddle/fluid/operators/clip_op.h" +#include namespace paddle { namespace operators { @@ -76,12 +77,28 @@ class ClipOpGrad : public framework::OperatorWithKernel { } }; +class ClipGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("clip_grad"); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(clip, ops::ClipOp, ops::ClipOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::ClipGradOpDescMaker); REGISTER_OPERATOR(clip_grad, ops::ClipOpGrad); REGISTER_OP_CPU_KERNEL( clip, ops::ClipKernel); diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index 194f9cf5033a3a73afeb8e92ddbdcc7b316fcd35..1f71555180361a1522b7a1c8383fe128bc4edcd0 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/concat_op.h" +#include #include #include @@ -50,9 +51,19 @@ class ConcatOp : public framework::OperatorWithKernel { if (j == axis) { out_dims[axis] += ins[i][j]; } else { - PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j], - "Input tensors should have the same " - "elements except the specify axis."); + if (ctx->IsRuntime()) { + // check all shape in run time + PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j], + "Input tensors should have the same " + "elements except the specify axis."); + } else { + // not check -1 with other in compile time + if (out_dims[j] > 0 && ins[i][j] > 0) { + PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j], + "Input tensors should have the same " + "elements except the specify axis."); + } + } } } } @@ -110,11 +121,7 @@ Examples: class ConcatOpGrad : public framework::OperatorWithKernel { public: - ConcatOpGrad(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorWithKernel(type, inputs, outputs, attrs) {} + using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { auto in_x = "X"; @@ -132,6 +139,33 @@ class ConcatOpGrad : public framework::OperatorWithKernel { } } } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.GetPlace()); + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ConcatOpGradNoNeedBufferVarInference, + "X"); + +class ConcatGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("concat_grad"); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X", false)); + op->SetAttrMap(Attrs()); + return op; + } }; } // namespace operators @@ -139,9 +173,9 @@ class ConcatOpGrad 
: public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OPERATOR(concat, ops::ConcatOp, ops::ConcatOpMaker, - paddle::framework::DefaultGradOpDescMaker< - false> /* set false to disable empty grad */); -REGISTER_OPERATOR(concat_grad, ops::ConcatOpGrad); + ops::ConcatGradOpDescMaker); +REGISTER_OPERATOR(concat_grad, ops::ConcatOpGrad, + ops::ConcatOpGradNoNeedBufferVarInference); REGISTER_OP_CPU_KERNEL( concat, ops::ConcatKernel, ops::ConcatKernel, diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt index b614e9b03502634a29333f331e25201a0f77ba38..7aa1c44eaafe53034b19ee52c59cc94d3a1269da 100644 --- a/paddle/fluid/operators/controlflow/CMakeLists.txt +++ b/paddle/fluid/operators/controlflow/CMakeLists.txt @@ -1,4 +1,5 @@ include(operators) register_operators(DEPS naive_executor) +cc_library(while_op_helper SRCS while_op_helper.cc DEPS operator) file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n") diff --git a/paddle/fluid/operators/controlflow/get_places_op.cc b/paddle/fluid/operators/controlflow/get_places_op.cc index 1a157688f3d02185d18b66ff5ba3613b6cf438ad..fa77f97419b6d605e478709e13413606ff124572 100644 --- a/paddle/fluid/operators/controlflow/get_places_op.cc +++ b/paddle/fluid/operators/controlflow/get_places_op.cc @@ -93,11 +93,9 @@ execution. class GetPlacesInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { - for (auto &o_name : op_desc.Output("Out")) { - block->FindRecursiveOrCreateVar(o_name).SetType( - framework::proto::VarType::PLACE_LIST); + void operator()(framework::InferVarTypeContext *ctx) const override { + for (auto &o_name : ctx->Output("Out")) { + ctx->SetType(o_name, framework::proto::VarType::PLACE_LIST); } } }; diff --git a/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc b/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc index fa18ade3234ed1802bb44ad622f9041dc73d84ee..45f18ac9255bdd75d8cbb5e1dd30ebba52260850 100644 --- a/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc +++ b/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc @@ -100,16 +100,13 @@ class WriteToArrayInferShape : public framework::InferShapeBase { class WriteToArrayInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { - auto x_name = op_desc.Input("X")[0]; - auto out_name = op_desc.Output("Out")[0]; + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = ctx->Input("X")[0]; + auto out_name = ctx->Output("Out")[0]; VLOG(10) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY"; - auto &out = block->FindRecursiveOrCreateVar(out_name); - out.SetType(framework::proto::VarType::LOD_TENSOR_ARRAY); - auto *x = block->FindVarRecursive(x_name); - if (x != nullptr) { - out.SetDataType(x->GetDataType()); + ctx->SetType(out_name, framework::proto::VarType::LOD_TENSOR_ARRAY); + if (ctx->HasVar(x_name)) { + ctx->SetDataType(out_name, ctx->GetDataType(x_name)); } } }; diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index 0360cf5273591946570cac47e2578e43f498b550..b3219208825cd1aea4c869064ff8f5fa8d3300fd 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ 
b/paddle/fluid/operators/controlflow/while_op.cc @@ -18,6 +18,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/controlflow/while_op_helper.h" #include "paddle/fluid/operators/detail/safe_ref.h" namespace paddle { @@ -26,14 +27,6 @@ namespace operators { using StepScopeVar = std::vector; using LoDTensor = framework::LoDTensor; -static constexpr char kStepBlock[] = "sub_block"; -static constexpr char kCondition[] = "Condition"; -static constexpr char kStepScopes[] = "StepScopes"; -static constexpr char kX[] = "X"; -static constexpr char kXGRAD[] = "X@GRAD"; -static constexpr char kOutputs[] = "Out"; -static constexpr char kSkipEagerDeletionVars[] = "skip_eager_deletion_vars"; - namespace { // NOLINT static std::string GetSkipEagerDeletionVarsDebugString( const std::vector &vars) { @@ -58,6 +51,7 @@ class WhileOp : public framework::OperatorBase { void RunImpl(const framework::Scope &scope, const platform::Place &dev_place) const override { PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition))); + auto &cond = scope.FindVar(Input(kCondition))->Get(); PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1})); @@ -77,13 +71,34 @@ class WhileOp : public framework::OperatorBase { VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); auto ctx = executor.Prepare(*program, block->ID(), skip_vars); - while (cond.data()[0]) { + if (!is_test) { + while (cond.data()[0]) { + auto &current_scope = scope.NewScope(); + step_scopes->push_back(&current_scope); + executor.RunPreparedContext(ctx.get(), &current_scope, false, true, + true); + } + } else { auto &current_scope = scope.NewScope(); - step_scopes->push_back(&current_scope); - executor.RunPreparedContext(ctx.get(), &current_scope, false, true, true); - if (is_test) { - scope.DeleteScope(&current_scope); + executor.CreateVariables(*program, &current_scope, block->ID()); + while (cond.data()[0]) { + for (auto &name : current_scope.LocalVarNames()) { + auto *var = current_scope.Var(name); + if (var->IsType()) { + // Clear all lod information for all lod_tensors. + auto *t = var->GetMutable(); + framework::LoD empty_lod; + t->set_lod(empty_lod); + } else if (var->IsType()) { + // Clear elements of all tensor arrays.
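+ // (Editorial note: in test mode the same scope is reused across loop
+ // iterations instead of pushing a fresh scope per step, so per-iteration
+ // metadata must be reset by hand; a stale lod such as {{0, 2, 5}} left
+ // over from the previous step would otherwise be paired with the next
+ // step's data.)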
+ auto *t = var->GetMutable(); + t->clear(); + } + } + executor.RunPreparedContext(ctx.get(), &current_scope, false, false, + false); } + scope.DeleteScope(&current_scope); } } }; @@ -372,19 +387,16 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { class WhileGradOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { - auto p_names = op_desc.Input(kX); - auto pg_ig_names = op_desc.Output(framework::GradVarName(kX)); + void operator()(framework::InferVarTypeContext *ctx) const override { + auto p_names = ctx->Input(kX); + auto pg_ig_names = ctx->Output(framework::GradVarName(kX)); for (size_t i = 0; i < p_names.size(); ++i) { - auto &p_var = detail::Ref(block->FindVarRecursive(p_names[i])); - auto *g_var = block->FindVarRecursive(pg_ig_names[i]); - if (g_var != nullptr) { // Gradient could be @EMPTY@ + if (ctx->HasVar(pg_ig_names[i])) { VLOG(5) << "Setting " << pg_ig_names[i] << " following " << p_names[i] - << " type: " << p_var.GetType(); - g_var->SetType(p_var.GetType()); - g_var->SetDataType(p_var.GetDataType()); + << " type: " << ctx->GetType(p_names[i]); + ctx->SetType(pg_ig_names[i], ctx->GetType(p_names[i])); + ctx->SetDataType(pg_ig_names[i], ctx->GetDataType(p_names[i])); } } } diff --git a/paddle/fluid/operators/controlflow/while_op_helper.cc b/paddle/fluid/operators/controlflow/while_op_helper.cc new file mode 100644 index 0000000000000000000000000000000000000000..2cbd94a061b5b369d67b6e0995d6b8fd45801828 --- /dev/null +++ b/paddle/fluid/operators/controlflow/while_op_helper.cc @@ -0,0 +1,291 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/controlflow/while_op_helper.h" +#include +#include +#include +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace operators { + +// OpVariant is a wrapper class of OpDesc and OperatorBase, +// so that both expose the same API.
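+// A minimal analogue with std::variant (editorial sketch, not part of this
+// patch; Paddle uses boost::variant here): two unrelated types exposed
+// through one accessor via a visitor:
+//
+//   #include <variant>
+//   struct A { int n() const { return 1; } };
+//   struct B { int n() const { return 2; } };
+//   using V = std::variant<const A *, const B *>;
+//   int N(const V &v) {
+//     return std::visit([](const auto *p) { return p->n(); }, v);
+//   }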
+class OpVariant { + struct InputsVisitor + : public boost::static_visitor { + template + const framework::VariableNameMap *operator()(const OpType *op) const { + return &(op->Inputs()); + } + }; + + struct OutputsVisitor + : public boost::static_visitor { + template + const framework::VariableNameMap *operator()(const OpType *op) const { + return &(op->Outputs()); + } + }; + + struct AttributeMapVisitor + : public boost::static_visitor { + const framework::AttributeMap *operator()( + const framework::OpDesc *op) const { + return &(op->GetAttrMap()); + } + + const framework::AttributeMap *operator()( + const framework::OperatorBase *op) const { + return &(op->Attrs()); + } + }; + + struct RawPointerVisitor : public boost::static_visitor { + template + const void *operator()(const OpType *op) const { + return op; + } + }; + + public: + OpVariant(const framework::OperatorBase *op) : op_(op) {} // NOLINT + + OpVariant(const framework::OpDesc *op) : op_(op) {} // NOLINT + + const framework::VariableNameMap &Inputs() const { + return *boost::apply_visitor(InputsVisitor(), op_); + } + + const framework::VariableNameMap &Outputs() const { + return *boost::apply_visitor(OutputsVisitor(), op_); + } + + const framework::AttributeMap &Attrs() const { + return *boost::apply_visitor(AttributeMapVisitor(), op_); + } + + template + const AttrType &Attr(const std::string &name) const { + auto &attrs = Attrs(); + auto it = attrs.find(name); + PADDLE_ENFORCE(it != attrs.end(), "Cannot find attribute %s", name); + return boost::get(it->second); + } + + bool operator==(const OpVariant &other) const { + return RawPointer() == other.RawPointer(); + } + + const void *RawPointer() const { + return boost::apply_visitor(RawPointerVisitor(), op_); + } + + int which() const { return static_cast(op_.which()); } + + struct Hasher { + size_t operator()(const OpVariant &op) const { + return reinterpret_cast(op.RawPointer()); + } + }; + + private: + const boost::variant + op_; +}; + +static std::string GetDebugString(const std::vector &names) { + if (names.empty()) return ""; + std::string ret = names[0]; + for (size_t i = 1; i < names.size(); ++i) { + ret += (" " + names[i]); + } + return ret; +} + +// Set skip variables of while_op and while_grad_op +// These variables should be skipped when eager deletion is enabled. +// This is because: +// 1. while_grad_op needs some variables defined in while_op. +// 2. while_grad_op needs variables from the previous time step. +static void SetSkipVars(const OpVariant &op, std::vector attr) { + auto &attrs = const_cast(op.Attrs()); + VLOG(2) << "Prepare to skip " << attr.size() + << " var(s): " << GetDebugString(attr); + attrs[kSkipEagerDeletionVars] = std::move(attr); +} + +// Check whether the forward while_op and while_grad_op match +// The program may have many while_ops. +static bool IsMatchedWhileOpAndWhileGradOp(const OpVariant &fwd_op, + const OpVariant &grad_op) { + return fwd_op.Inputs().at(kX) == grad_op.Inputs().at(kX) && + fwd_op.Outputs().at(kOutputs) == grad_op.Inputs().at(kOutputs); +} + +// Test whether the variable is skippable in forward while_op +// The variable is skippable in while_op when the variable used in while_grad +// is not from grad_block.
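+// Example (editorial, with a hypothetical variable name): if grad_block
+// reads a variable "h_pre" that is defined in the forward while_op's block
+// rather than in grad_block itself, then "h_pre" must outlive the forward
+// pass, so ModifyWhileOpAndWhileGradOpAttr below adds it to the forward
+// op's skip_eager_deletion_vars.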
+static bool IsSkippableVar(const std::string &name, + framework::BlockDesc *grad_block) { + return name != framework::kEmptyVarName && !grad_block->HasVar(name); +} + +static void ModifyWhileOpAndWhileGradOpAttr(const OpVariant &fwd_op, + const OpVariant &bwd_op) { + auto *grad_block = bwd_op.Attr(kStepBlock); + + // Find all skippable variables in forward while_op + std::unordered_set forward_skip_vars; + for (auto *op_desc : grad_block->AllOps()) { + for (auto &in_arg_name : op_desc->InputArgumentNames()) { + if (IsSkippableVar(in_arg_name, grad_block)) { + forward_skip_vars.insert(in_arg_name); + } + } + + for (auto &out_arg_name : op_desc->OutputArgumentNames()) { + if (IsSkippableVar(out_arg_name, grad_block)) { + forward_skip_vars.insert(out_arg_name); + } + } + } + + SetSkipVars(fwd_op, std::vector(forward_skip_vars.begin(), + forward_skip_vars.end())); + + // Find all skippable variables in while_grad_op + // The skipped variables are those that are used across time steps. + auto &fwd_input = fwd_op.Inputs().at(kX); + auto &in_grads = bwd_op.Outputs().at(framework::GradVarName(kX)); + PADDLE_ENFORCE_EQ( + fwd_input.size(), in_grads.size(), + "Backward input gradient number does not match forward input number."); + + std::unordered_set backward_skip_vars; + for (size_t i = 0; i < in_grads.size(); ++i) { + if (in_grads[i] == framework::kEmptyVarName) { + continue; + } + backward_skip_vars.insert(in_grads[i]); + backward_skip_vars.insert(framework::GradVarName(fwd_input[i])); + } + + SetSkipVars(bwd_op, std::vector(backward_skip_vars.begin(), + backward_skip_vars.end())); +} + +// Find all while_ops and while_grad_ops in the graph or program. +// The while_grad_op and its while_op may be located in different blocks, +// so we traverse all blocks in the program to find them.
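+// (Editorial summary of the pairing procedure: FindAllWhileAndWhileGradOp
+// below collects every while and while_grad op across all blocks; the Impl
+// routine then matches each while_grad to its forward while via
+// IsMatchedWhileOpAndWhileGradOp — name-list equality on kX and kOutputs,
+// not pointer identity — and applies ModifyWhileOpAndWhileGradOpAttr above
+// to record skip variables on both sides.)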
+static void FindAllWhileAndWhileGradOp(std::vector *while_ops, + std::vector *while_grad_ops) { + PADDLE_ENFORCE_GE(while_ops->size(), while_grad_ops->size()); + + if (while_ops->empty()) return; + + const auto *program = + while_ops->front().Attr(kStepBlock)->Program(); + for (size_t i = 1; i < program->Size(); ++i) { + auto &block = program->Block(i); + for (size_t j = 0; j < block.OpSize(); ++j) { + auto *op = block.Op(j); + if (op->Type() == "while") { + while_ops->emplace_back(op); + } else if (op->Type() == "while_grad") { + while_grad_ops->emplace_back(op); + } + } + } + + PADDLE_ENFORCE_GE(while_ops->size(), while_grad_ops->size(), + "There are extra while_grad ops in the graph or program"); +} + +static void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl( + std::vector *while_ops, std::vector *while_grad_ops) { + FindAllWhileAndWhileGradOp(while_ops, while_grad_ops); + + VLOG(2) << "Found while op num: " << while_ops->size() + << ", while grad op num: " << while_grad_ops->size(); + + if (while_grad_ops->empty()) { + return; + } + + std::unordered_set while_op_set( + while_ops->begin(), while_ops->end()); + + for (auto &bwd_op : *while_grad_ops) { + const OpVariant *matched_fwd_op = nullptr; + for (auto &fwd_op : while_op_set) { + if (IsMatchedWhileOpAndWhileGradOp(fwd_op, bwd_op)) { + PADDLE_ENFORCE(matched_fwd_op == nullptr, + "Found multiple matched while ops"); + matched_fwd_op = &fwd_op; + } + } + PADDLE_ENFORCE_NOT_NULL(matched_fwd_op, + "Cannot find matched forward while op."); + ModifyWhileOpAndWhileGradOpAttr(*matched_fwd_op, bwd_op); + while_op_set.erase(*matched_fwd_op); + } +} + +void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( + int block_id, + const std::vector> &all_ops) { + // If block_id is not 0, return immediately: all while_ops and + // while_grad_ops in the whole program are processed when block_id is 0 + // (i.e. when Executor::Run() is called or a ParallelExecutor is + // constructed). + + // Moreover, all while_ops and while_grad_ops must be processed when + // block_id is zero; otherwise, a while_op may run first and erase + // variables used in its while_grad_op, at which point the while_grad_ops + // may not have been constructed yet. + if (block_id != 0) return; + + std::vector fwd_ops, bwd_ops; + for (auto &op : all_ops) { + if (op->Type() == "while") { + fwd_ops.emplace_back(op.get()); + } else if (op->Type() == "while_grad") { + bwd_ops.emplace_back(op.get()); + } + } + PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl(&fwd_ops, &bwd_ops); +} + +void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( + const std::vector &while_ops, + const std::vector &while_grad_ops) { + std::vector fwd_ops, bwd_ops; + fwd_ops.reserve(while_ops.size()); + for (auto *op : while_ops) { + fwd_ops.emplace_back(op); + } + + bwd_ops.reserve(while_grad_ops.size()); + for (auto *op : while_grad_ops) { + bwd_ops.emplace_back(op); + } + + PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl(&fwd_ops, &bwd_ops); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/controlflow/while_op_helper.h b/paddle/fluid/operators/controlflow/while_op_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..456ba8642b9bd32a1236d112cc8b387ae6a279d3 --- /dev/null +++ b/paddle/fluid/operators/controlflow/while_op_helper.h @@ -0,0 +1,43 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/variant.h" + +namespace paddle { +namespace operators { + +static constexpr char kStepBlock[] = "sub_block"; +static constexpr char kCondition[] = "Condition"; +static constexpr char kStepScopes[] = "StepScopes"; +static constexpr char kX[] = "X"; +static constexpr char kXGRAD[] = "X@GRAD"; +static constexpr char kOutputs[] = "Out"; +static constexpr char kSkipEagerDeletionVars[] = "skip_eager_deletion_vars"; + +void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( + int block_id, + const std::vector> &all_ops); + +void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( + const std::vector &while_ops, + const std::vector &while_grad_ops); + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index ca6bc4df0fe2c6cddaf548d3e708e777172a0841..619e12e6ba7c73e46beafadd50770aedfb52c964 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/conv_op.h" +#include #include #include @@ -194,6 +195,12 @@ void Conv2DOpMaker::Make() { AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); + AddAttr("use_quantizer", + "(bool, default false) " + "Set to true for operators that should be quantized and use " + "int8 kernel. 
" + "Only used on CPU.") + .SetDefault(false); AddAttr("fuse_relu", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); AddAttr("fuse_residual_connection", @@ -448,13 +455,13 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType( return type; } -class Conv2dGradMaker : public framework::SingleGradOpDescMaker { +class Conv2DGradMaker : public framework::SingleGradOpDescMaker { public: using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; std::unique_ptr Apply() const override { auto* op = new framework::OpDesc(); - op->SetType(GradOpType()); + op->SetType(this->ForwardOpType() + "_grad"); op->SetInput("Input", Input("Input")); op->SetInput("Filter", Input("Filter")); op->SetInput("Bias", Input("Bias")); @@ -463,14 +470,33 @@ class Conv2dGradMaker : public framework::SingleGradOpDescMaker { op->SetOutput(framework::GradVarName("Input"), InputGrad("Input")); op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter")); op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias")); - op->SetAttrMap(Attrs()); return std::unique_ptr(op); } +}; + +class Conv3DGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto* op = new framework::OpDesc(); + op->SetType(this->ForwardOpType() + "_grad"); + op->SetInput("Input", Input("Input")); + op->SetInput("Filter", Input("Filter")); + op->SetInput(framework::GradVarName("Output"), OutputGrad("Output")); + + op->SetOutput(framework::GradVarName("Input"), InputGrad("Input")); + op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter")); - virtual std::string GradOpType() const { - return this->ForwardOpType() + "_grad"; + if (ForwardOp().Inputs().count("ResidualData") != 0) { + op->SetInput("ResidualData", Input("ResidualData")); + } + + op->SetAttrMap(Attrs()); + + return std::unique_ptr(op); } }; @@ -479,17 +505,16 @@ class Conv2dGradMaker : public framework::SingleGradOpDescMaker { namespace ops = paddle::operators; REGISTER_OPERATOR(conv2d, ops::ConvOp, ops::Conv2DOpMaker, - ops::ConvOpInferVarType, ops::Conv2dGradMaker); + ops::ConvOpInferVarType, ops::Conv2DGradMaker); REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad); // depthwise convolution op REGISTER_OPERATOR(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker, - ops::ConvOpInferVarType, ops::Conv2dGradMaker); + ops::ConvOpInferVarType, ops::Conv2DGradMaker); REGISTER_OPERATOR(depthwise_conv2d_grad, ops::ConvOpGrad); REGISTER_OPERATOR(conv3d, ops::ConvOp, ops::Conv3DOpMaker, - ops::ConvOpInferVarType, - paddle::framework::DefaultGradOpDescMaker); + ops::ConvOpInferVarType, ops::Conv3DGradMaker); REGISTER_OPERATOR(conv3d_grad, ops::ConvOpGrad); // depthwise conv kernel diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index c994c6f642d286d9b52ada667058b064ff242ce6..baa39c0f9926efc233f9a228e055e2eb2116dbcc 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/conv_transpose_op.h" +#include #include #include @@ -344,6 +345,28 @@ framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType( ctx.GetPlace(), layout_, library_); } +class ConvTransposeGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType(ForwardOp().Type() + "_grad"); + op->SetInput("Input", Input("Input")); + op->SetInput("Filter", Input("Filter")); + op->SetOutput(framework::GradVarName("Input"), InputGrad("Input")); + op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter")); + if (ForwardOp().Inputs().count("Bias") > 0) { + op->SetInput("Bias", Input("Bias")); + op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias")); + } + op->SetInput(framework::GradVarName("Output"), OutputGrad("Output")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle @@ -352,7 +375,7 @@ namespace ops = paddle::operators; // conv2d_transpose REGISTER_OPERATOR(conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::ConvTransposeGradOpDescMaker); REGISTER_OPERATOR(conv2d_transpose_grad, ops::ConvTransposeOpGrad); REGISTER_OP_CPU_KERNEL( @@ -368,7 +391,7 @@ REGISTER_OP_CPU_KERNEL( // conv3d_transpose REGISTER_OPERATOR(conv3d_transpose, ops::ConvTransposeOp, ops::Conv3DTransposeOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::ConvTransposeGradOpDescMaker); REGISTER_OPERATOR(conv3d_transpose_grad, ops::ConvTransposeOpGrad); REGISTER_OP_CPU_KERNEL( @@ -384,7 +407,7 @@ REGISTER_OP_CPU_KERNEL( // depthwise conv2d_transpose REGISTER_OPERATOR(depthwise_conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::ConvTransposeGradOpDescMaker); REGISTER_OPERATOR(depthwise_conv2d_transpose_grad, ops::ConvTransposeOpGrad); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/cos_sim_op.cc b/paddle/fluid/operators/cos_sim_op.cc index 8f3644039f9950a8a70e2fd66c20837a5f52bd7f..30ec74d8442d2f42510220b825988b340f79d0a2 100644 --- a/paddle/fluid/operators/cos_sim_op.cc +++ b/paddle/fluid/operators/cos_sim_op.cc @@ -74,6 +74,9 @@ class CosSimOpMaker : public framework::OpProtoAndCheckerMaker { "Norm of the second input, reduced along the 1st " "dimension.") .AsIntermediate(); + AddAttr(framework::kAllKernelsMustComputeRuntimeShape, + "Skip calling InferShape() function in the runtime.") + .SetDefault(true); AddComment(R"DOC( **Cosine Similarity Operator** diff --git a/paddle/fluid/operators/cos_sim_op.h b/paddle/fluid/operators/cos_sim_op.h index 76cfc680518a3caaa68abc48cedf82ce7d21c8b8..0b4e3f774674112ddc268ba911e1df317d5edcca 100644 --- a/paddle/fluid/operators/cos_sim_op.h +++ b/paddle/fluid/operators/cos_sim_op.h @@ -28,17 +28,21 @@ class CosSimKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { // get Tensor - auto* in_x = context.Input("X"); + auto* in_x = context.Input("X"); auto* in_y = context.Input("Y"); - auto* out_z = context.Output("Out"); + auto* out_z = context.Output("Out"); auto* out_x_norm = context.Output("XNorm"); auto* out_y_norm = context.Output("YNorm"); - out_z->mutable_data(context.GetPlace()); - out_x_norm->mutable_data(context.GetPlace()); - out_y_norm->mutable_data(context.GetPlace()); 
int rows_x = in_x->dims()[0]; int rows_y = in_y->dims()[0]; + out_z->Resize({rows_x, 1}); + out_x_norm->Resize({rows_x, 1}); + out_y_norm->Resize({rows_y, 1}); + out_z->mutable_data(context.GetPlace()); + out_x_norm->mutable_data(context.GetPlace()); + out_y_norm->mutable_data(context.GetPlace()); + out_z->set_lod(in_x->lod()); int cols = framework::product(in_x->dims()) / rows_x; @@ -81,6 +85,7 @@ class CosSimGradKernel : public framework::OpKernel { if (rows_x == rows_y) { if (out_grad_x) { + out_grad_x->Resize(in_x->dims()); math::CosSimGradFunctor functor( in_x_norm->data(), in_y_norm->data(), in_x->data(), in_y->data(), in_z->data(), in_grad_z->data(), @@ -91,6 +96,7 @@ class CosSimGradKernel : public framework::OpKernel { for_range(functor); } if (out_grad_y) { + out_grad_y->Resize(in_y->dims()); math::CosSimGradFunctor functor( in_y_norm->data(), in_x_norm->data(), in_y->data(), in_x->data(), in_z->data(), in_grad_z->data(), @@ -102,6 +108,7 @@ class CosSimGradKernel : public framework::OpKernel { } } else { if (out_grad_x) { + out_grad_x->Resize(in_x->dims()); math::CosSimDxFunctor functor( in_x_norm->data(), in_y_norm->data(), in_x->data(), in_y->data(), in_z->data(), in_grad_z->data(), @@ -112,6 +119,7 @@ class CosSimGradKernel : public framework::OpKernel { for_range(functor); } if (out_grad_y) { + out_grad_y->Resize(in_y->dims()); out_grad_y->mutable_data(context.GetPlace()); math::SetConstant set_zero; auto& dev_ctx = context.template device_context(); diff --git a/paddle/fluid/operators/crf_decoding_op.h b/paddle/fluid/operators/crf_decoding_op.h index 72774a878d98b431da05cf870139752421b2df8d..d6b54038ec5648c72d606a6c7b9c8356cb74521b 100644 --- a/paddle/fluid/operators/crf_decoding_op.h +++ b/paddle/fluid/operators/crf_decoding_op.h @@ -82,8 +82,9 @@ class CRFDecodingOpKernel : public framework::OpKernel { Tensor track; int* track_value = track.mutable_data(emission_dims, platform::CPUPlace()); - auto ker = jit::Get, - platform::CPUPlace>(tag_num); + auto ker = + jit::KernelFuncs, platform::CPUPlace>::Cache() + .At(tag_num); ker(static_cast(seq_len), x, w, alpha_value, track_value, tag_num); T max_score = -std::numeric_limits::max(); int max_i = 0; diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc index 97d20681b8136c13d512c0b86a7ff15b24367db2..78fcd07e1df8d590ad2a4508bbc82477d928c6e9 100644 --- a/paddle/fluid/operators/crop_op.cc +++ b/paddle/fluid/operators/crop_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/crop_op.h" -#include +#include +#include +#include namespace paddle { namespace operators { @@ -178,12 +180,31 @@ class CropOpGrad : public framework::OperatorWithKernel { } }; +class CropGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("crop_grad"); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetInput("X", Input("X")); + if (ForwardOp().Inputs().count("Offsets") > 0) { + op->SetInput("Offsets", Input("Offsets")); + } + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(crop, ops::CropOp, ops::CropOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::CropGradOpDescMaker); REGISTER_OPERATOR(crop_grad, ops::CropOpGrad); REGISTER_OP_CPU_KERNEL( crop, ops::CropKernel); diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc index 3adc7baebddd06ced74afea1e77017beb57582e8..ad32de53e7019b438b7106ddd031a8f00bd79b5d 100644 --- a/paddle/fluid/operators/cross_entropy_op.cc +++ b/paddle/fluid/operators/cross_entropy_op.cc @@ -13,18 +13,21 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/cross_entropy_op.h" +#include #include +#include namespace paddle { namespace operators { -class CrossEntropyOp : public framework::OperatorWithKernel { +class CrossEntropyOpBase : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null."); auto x_dims = ctx->GetInputDim("X"); @@ -43,7 +46,8 @@ class CrossEntropyOp : public framework::OperatorWithKernel { "Input(X) and Input(Label) shall have the same shape " "except the last dimension."); } - if (ctx->Attrs().Get("soft_label")) { + + if (IsSoftLabel(ctx)) { if (check) { PADDLE_ENFORCE_EQ(x_dims[rank - 1], label_dims[rank - 1], "If Attr(soft_label) == true, the last dimension of " @@ -69,21 +73,24 @@ class CrossEntropyOp : public framework::OperatorWithKernel { return framework::OpKernelType(ctx.Input("X")->type(), ctx.device_context()); } + + virtual bool IsSoftLabel(framework::InferShapeContext* ctx) const { + return ctx->Attrs().Get("soft_label"); + } }; -class CrossEntropyGradientOp : public framework::OperatorWithKernel { +class CrossEntropyGradientOpBase : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + void InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), "Input(Y@GRAD) shoudl be not null."); PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), "Output(X@GRAD) should be not null."); - auto x_dims = ctx->GetInputDim("X"); + auto x_dims = 
GetXDim(ctx); auto label_dims = ctx->GetInputDim("Label"); auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y")); int rank = x_dims.size(); @@ -108,9 +115,7 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel { "The Input(X) and Input(Y@Grad) should have the same " "shape except the last dimension."); } - PADDLE_ENFORCE_EQ(dy_dims[rank - 1], 1, - "The last dimension of Input(Y@Grad) should be 1."); - if (ctx->Attrs().Get("soft_label")) { + if (IsSoftLabel(ctx)) { if (check) { PADDLE_ENFORCE_EQ( x_dims[rank - 1], label_dims[rank - 1], @@ -123,7 +128,10 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel { "Input(Label) should be 1."); } ctx->SetOutputDim(framework::GradVarName("X"), x_dims); - ctx->ShareLoD("X", framework::GradVarName("X")); + PADDLE_ENFORCE_EQ(dy_dims[rank - 1], 1, + "The last dimension of Input(Y@Grad) should be 1."); + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + ctx->ShareLoD(VarNameWithXLoD(), framework::GradVarName("X")); } protected: @@ -131,8 +139,28 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel { // is determined by its input "X". framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - ctx.device_context()); + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Y"))->type(), + ctx.device_context()); + } + + virtual framework::DDim GetXDim(framework::InferShapeContext* ctx) const { + return ctx->GetInputDim("X"); + } + + virtual const char* VarNameWithXLoD() const { return "X"; } + + virtual bool IsSoftLabel(framework::InferShapeContext* ctx) const { + return ctx->Attrs().Get("soft_label"); + } +}; + +class CrossEntropyOpInferVarType + : public framework::PassInDtypeAndVarTypeToOutput { + protected: + std::unordered_map GetInputOutputWithSameType() + const override { + return std::unordered_map{{"X", /*->*/ "Y"}}; } }; @@ -200,26 +228,164 @@ or not. But the output only shares the LoD information with input X. 
} }; -class CrossEntropyOpInferVarType - : public framework::PassInDtypeAndVarTypeToOutput { +class CrossEntropyGradientOp : public CrossEntropyGradientOpBase { + public: + using CrossEntropyGradientOpBase::CrossEntropyGradientOpBase; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + CrossEntropyGradientOpBase::InferShape(ctx); + } +}; + +class CrossEntropyGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + protected: - std::unordered_map GetInputOutputWithSameType() - const override { - return std::unordered_map{{"X", /*->*/ "Y"}}; + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("cross_entropy_grad"); + op->SetInput("X", Input("X")); + op->SetInput("Label", Input("Label")); + op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +class CrossEntropyOp2 : public CrossEntropyOpBase { + public: + using CrossEntropyOpBase::CrossEntropyOpBase; + + void InferShape(framework::InferShapeContext* ctx) const override { + CrossEntropyOpBase::InferShape(ctx); + + PADDLE_ENFORCE(ctx->HasOutput("XShape"), + "Output(XShape) should be not null."); + + PADDLE_ENFORCE(ctx->HasOutput("MatchX"), + "Output(MatchX) should be not null."); + auto x_dims = ctx->GetInputDim("X"); + auto x_dims_vec = framework::vectorize(x_dims); + x_dims_vec.push_back(0); + ctx->SetOutputDim("XShape", framework::make_ddim(x_dims_vec)); + x_dims[x_dims.size() - 1] = 1; + ctx->SetOutputDim("MatchX", x_dims); + ctx->ShareLoD("X", /*->*/ "XShape"); + } + + protected: + bool IsSoftLabel(framework::InferShapeContext* ctx) const override { + return false; } }; + +class CrossEntropyGradientOp2 : public CrossEntropyGradientOpBase { + public: + using CrossEntropyGradientOpBase::CrossEntropyGradientOpBase; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("MatchX"), "Input(MatchX) must exist"); + CrossEntropyGradientOpBase::InferShape(ctx); + } + + protected: + virtual framework::DDim GetXDim(framework::InferShapeContext* ctx) const { + auto x_shape = ctx->GetInputDim("XShape"); + return framework::DDim(x_shape.Get(), x_shape.size() - 1); + } + + virtual const char* VarNameWithXLoD() const { return "XShape"; } + + virtual bool IsSoftLabel(framework::InferShapeContext* ctx) const { + return false; + } +}; + +class CrossEntropyOpMaker2 : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor, default Tensor), a tensor whose last dimension " + "size is equal to the number of classes. This input is a " + "probability computed by the previous operator, which is almost " + "always the result of a softmax operator."); + AddInput( + "Label", + "(Tensor), the tensor which represents the ground truth. It has the " + "same shape as 'X' except the last dimension. It is a one-hot Tensor."); + AddOutput("Y", + "(Tensor, default Tensor), a tensor whose shape is the same " + "as 'X' except that the last dimension size is 1.
It " + "represents the cross entropy loss."); + AddOutput("XShape", "Temporaily variable to save shape and LoD of X."); + AddOutput("MatchX", + "X value that matches label, used for gradient computation."); + AddAttr("ignore_index", + "(int, default -100), Specifies a target value that is" + "ignored and does not contribute to the input gradient." + "Only valid if soft_label is set to False") + .SetDefault(-100); + AddComment(R"DOC( +Hard-label CrossEntropy Operator. + +The input 'X' and 'Label' will first be logically flattened to 2-D matrixs. +The matrix's second dimension(row length) is as same as the original last +dimension, and the first dimension(column length) is the product of all other +original dimensions. Then the softmax computation will take palce on each raw +of flattened matrixs. + +Only support hard label. + +Both the input X and Label can carry the LoD (Level of Details) information, +or not. But the output only shares the LoD information with input X. + +)DOC"); + } +}; + +class CrossEntropyGradOpDescMaker2 : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("cross_entropy_grad2"); + op->SetInput("Label", Input("Label")); + op->SetInput("MatchX", Output("MatchX")); + op->SetInput("XShape", Output("XShape")); + op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; using CPUCtx = paddle::platform::CPUDeviceContext; -REGISTER_OPERATOR(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker, - ops::CrossEntropyOpInferVarType, - paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(cross_entropy, ops::CrossEntropyOpBase, + ops::CrossEntropyOpMaker, ops::CrossEntropyOpInferVarType, + ops::CrossEntropyGradOpDescMaker); REGISTER_OPERATOR(cross_entropy_grad, ops::CrossEntropyGradientOp); REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel, ops::CrossEntropyOpKernel); REGISTER_OP_CPU_KERNEL(cross_entropy_grad, ops::CrossEntropyGradientOpKernel, ops::CrossEntropyGradientOpKernel); + +REGISTER_OPERATOR(cross_entropy2, ops::CrossEntropyOp2, + ops::CrossEntropyOpMaker2, ops::CrossEntropyOpInferVarType, + ops::CrossEntropyGradOpDescMaker2); +REGISTER_OPERATOR(cross_entropy_grad2, ops::CrossEntropyGradientOp2); +REGISTER_OP_CPU_KERNEL(cross_entropy2, + ops::CrossEntropyOpKernel2, + ops::CrossEntropyOpKernel2); +REGISTER_OP_CPU_KERNEL(cross_entropy_grad2, + ops::CrossEntropyGradientOpKernel2, + ops::CrossEntropyGradientOpKernel2); diff --git a/paddle/fluid/operators/cross_entropy_op.cu b/paddle/fluid/operators/cross_entropy_op.cu index fcd34383a85f6984a8f27ce0625364f8fd5e31d6..243e7f52c1e3c4c210e91f708ae5d6de97e4afbc 100644 --- a/paddle/fluid/operators/cross_entropy_op.cu +++ b/paddle/fluid/operators/cross_entropy_op.cu @@ -27,3 +27,13 @@ REGISTER_OP_CUDA_KERNEL( cross_entropy_grad, ops::CrossEntropyGradientOpKernel, ops::CrossEntropyGradientOpKernel, ops::CrossEntropyGradientOpKernel); + +REGISTER_OP_CUDA_KERNEL(cross_entropy2, + ops::CrossEntropyOpKernel2, + ops::CrossEntropyOpKernel2, + ops::CrossEntropyOpKernel2); + +REGISTER_OP_CUDA_KERNEL( + cross_entropy_grad2, ops::CrossEntropyGradientOpKernel2, + ops::CrossEntropyGradientOpKernel2, + ops::CrossEntropyGradientOpKernel2); diff 
--git a/paddle/fluid/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h index f123e11542d85c904a81fe2a87f59ab52511cc15..7eb663773ed072760c47a2914377b5306ceeb7af 100644 --- a/paddle/fluid/operators/cross_entropy_op.h +++ b/paddle/fluid/operators/cross_entropy_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math.h" #include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/for_range.h" @@ -137,5 +138,124 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel { } }; +template +struct HardLabelCrossEntropyForwardFunctor { + HardLabelCrossEntropyForwardFunctor(const T* x, T* y, T* match_x, + const int64_t* label, + int64_t ignore_index, + int64_t feature_size) + : x_(x), + y_(y), + match_x_(match_x), + label_(label), + ignore_index_(ignore_index), + feature_size_(feature_size) {} + + HOSTDEVICE void operator()(int64_t idx) const { + auto label = label_[idx]; + if (label != ignore_index_) { + auto match_x = x_[idx * feature_size_ + label]; + y_[idx] = -math::TolerableValue()(real_log(match_x)); + match_x_[idx] = match_x; + } else { + y_[idx] = 0; + match_x_[idx] = 0; // any value is ok + } + } + + const T* x_; + T* y_; + T* match_x_; + const int64_t* label_; + int64_t ignore_index_; + int64_t feature_size_; +}; + +template +struct HardLabelCrossEntropyBackwardFunctor { + HardLabelCrossEntropyBackwardFunctor(T* dx, const T* dy, const T* match_x, + const int64_t* label, + int64_t ignore_index, + int64_t feature_size) + : dx_(dx), + dy_(dy), + match_x_(match_x), + label_(label), + ignore_index_(ignore_index), + feature_size_(feature_size) {} + + HOSTDEVICE void operator()(int64_t idx) const { + auto row_idx = idx / feature_size_; + auto col_idx = idx % feature_size_; + auto label = label_[row_idx]; + if (label == col_idx && label != ignore_index_) { + dx_[idx] = -dy_[row_idx] / match_x_[row_idx]; + } else { + dx_[idx] = 0; + } + } + + T* dx_; + const T* dy_; + const T* match_x_; + const int64_t* label_; + int64_t ignore_index_; + int64_t feature_size_; +}; + +template +class CrossEntropyOpKernel2 : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* label = ctx.Input("Label"); + auto* y = ctx.Output("Y"); + auto* match_x = ctx.Output("MatchX"); + + auto& x_dims = x->dims(); + auto feature_size = x_dims[x_dims.size() - 1]; + auto batch_size = framework::product(x->dims()) / feature_size; + + auto* p_x = x->data(); + auto* p_label = label->data(); + auto* p_y = y->mutable_data(ctx.GetPlace()); + auto* p_match_x = match_x->mutable_data(ctx.GetPlace()); + + auto ignore_index = ctx.Attr("ignore_index"); + + platform::ForRange for_range( + ctx.template device_context(), batch_size); + for_range(HardLabelCrossEntropyForwardFunctor( + p_x, p_y, p_match_x, p_label, ignore_index, feature_size)); + } +}; + +template +class CrossEntropyGradientOpKernel2 : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Input(framework::GradVarName("Y")); + auto* match_x = ctx.Input("MatchX"); + auto* label = ctx.Input("Label"); + + auto* p_dx = dx->mutable_data(ctx.GetPlace()); + auto* p_dy = dy->data(); + auto* p_match_x = match_x->data(); + 
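+ // Worked example (editorial): with feature_size = 3, a row x = {0.2,
+ // 0.7, 0.1} and label = 1, the forward functor above stores
+ // y = -log(0.7) ≈ 0.357 and match_x = 0.7; the backward functor then
+ // sets dx = -dy / 0.7 at column 1 of that row and 0 elsewhere.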
auto* p_label = label->data(); + + int64_t ignore_index = ctx.Attr("ignore_index"); + int rank = dx->dims().size(); + int64_t feature_size = dx->dims()[rank - 1]; + int64_t batch_size = framework::product(dx->dims()) / feature_size; + + platform::ForRange for_range( + ctx.template device_context(), + batch_size * feature_size); + for_range(HardLabelCrossEntropyBackwardFunctor( + p_dx, p_dy, p_match_x, p_label, ignore_index, feature_size)); + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/cudnn_lstm_op.cc b/paddle/fluid/operators/cudnn_lstm_op.cc index e63d57be57a66e8e02f7ef88acd01246302bc53c..134f84d59cafa661fce727adc3303444c4ef483e 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include #include "paddle/fluid/framework/op_registry.h" @@ -170,11 +171,6 @@ class CudnnLSTMGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasInput("Input"), "Input(Input) of LSTM should not be null."); PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) of LSTM should not be null."); - PADDLE_ENFORCE(ctx->HasInput("last_h"), - "Input(last_h) of LSTM should not be null."); - PADDLE_ENFORCE(ctx->HasInput("last_c"), - "Input(last_c) of LSTM should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Cache"), "Input(last_c) of LSTM should not be null."); PADDLE_ENFORCE(ctx->HasInput("InitH"), @@ -197,6 +193,35 @@ class CudnnLSTMGradOp : public framework::OperatorWithKernel { } }; +class CudnnLSTMGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("cudnn_lstm_grad"); + op->SetInput("Input", Input("Input")); + op->SetInput("InitH", Input("InitH")); + op->SetInput("InitC", Input("InitC")); + op->SetInput("W", Input("W")); + if (ForwardOp().Inputs().count("Cache") > 0) { + op->SetInput("Cache", Input("Cache")); + } + op->SetInput("Out", Output("Out")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetInput(framework::GradVarName("last_c"), OutputGrad("last_c")); + op->SetInput(framework::GradVarName("last_h"), OutputGrad("last_h")); + + op->SetOutput(framework::GradVarName("Input"), InputGrad("Input")); + op->SetOutput(framework::GradVarName("W"), InputGrad("W")); + op->SetOutput(framework::GradVarName("InitH"), InputGrad("InitH")); + op->SetOutput(framework::GradVarName("InitC"), InputGrad("InitC")); + op->SetAttrMap(Attrs()); + return op; + } +}; + template class NotImpleKernel : public framework::OpKernel { public: @@ -211,7 +236,7 @@ class NotImpleKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OPERATOR(cudnn_lstm, ops::CudnnLSTMOp, ops::CudnnLSTMOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::CudnnLSTMGradOpDescMaker); REGISTER_OPERATOR(cudnn_lstm_grad, ops::CudnnLSTMGradOp); REGISTER_OP_CPU_KERNEL(cudnn_lstm, ops::NotImpleKernel); diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 933a28f3f909ad48dc5a62b9b04beccbe60e7d6b..94a2016aa53212c3ae5af6d86cccb117855cc3b4 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ 
-33,12 +33,15 @@ detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc) detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc) detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu) detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc) +detection_library(yolo_box_op SRCS yolo_box_op.cc yolo_box_op.cu) detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc box_decoder_and_assign_op.cu) if(WITH_GPU) detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS memory cub) + detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc distribute_fpn_proposals_op.cu DEPS memory cub) else() detection_library(generate_proposals_op SRCS generate_proposals_op.cc) + detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc) endif() detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op.cc roi_perspective_transform_op.cu) diff --git a/paddle/fluid/operators/detection/box_coder_op.cc b/paddle/fluid/operators/detection/box_coder_op.cc index 0a51d50e06176e713922837861f2102c9ee8a899..de3612677440596387f313e1ff59184cb3fdb7ae 100644 --- a/paddle/fluid/operators/detection/box_coder_op.cc +++ b/paddle/fluid/operators/detection/box_coder_op.cc @@ -60,14 +60,15 @@ class BoxCoderOp : public framework::OperatorWithKernel { } else if (code_type == BoxCodeType::kDecodeCenterSize) { PADDLE_ENFORCE_EQ(target_box_dims.size(), 3, "The rank of Input TargetBox must be 3"); - if (axis == 0) { - PADDLE_ENFORCE_EQ(target_box_dims[1], prior_box_dims[0]); - } else if (axis == 1) { - PADDLE_ENFORCE_EQ(target_box_dims[0], prior_box_dims[0]); - } else { - PADDLE_THROW("axis must be 0 or 1."); + PADDLE_ENFORCE(axis == 0 || axis == 1, "axis must be 0 or 1"); + if (ctx->IsRuntime()) { + if (axis == 0) { + PADDLE_ENFORCE_EQ(target_box_dims[1], prior_box_dims[0]); + } else if (axis == 1) { + PADDLE_ENFORCE_EQ(target_box_dims[0], prior_box_dims[0]); + } + PADDLE_ENFORCE_EQ(target_box_dims[2], prior_box_dims[1]); } - PADDLE_ENFORCE_EQ(target_box_dims[2], prior_box_dims[1]); ctx->ShareDim("TargetBox", /*->*/ "OutputBox"); } diff --git a/paddle/fluid/operators/detection/box_coder_op.h b/paddle/fluid/operators/detection/box_coder_op.h index 6d406f8196f9964c85bb94541fa7a7a23857539b..d4c7e8cf7723bf83d3cd8bf36b9ae6c5f1c35b10 100644 --- a/paddle/fluid/operators/detection/box_coder_op.h +++ b/paddle/fluid/operators/detection/box_coder_op.h @@ -20,7 +20,7 @@ namespace operators { enum class BoxCodeType { kEncodeCenterSize = 0, kDecodeCenterSize = 1 }; -inline BoxCodeType GetBoxCodeType(const std::string& type) { +inline BoxCodeType GetBoxCodeType(const std::string &type) { if (type == "encode_center_size") { return BoxCodeType::kEncodeCenterSize; } else if (type == "decode_center_size") { @@ -32,24 +32,23 @@ inline BoxCodeType GetBoxCodeType(const std::string& type) { template class BoxCoderKernel : public framework::OpKernel { public: - void EncodeCenterSize(const framework::Tensor* target_box, - const framework::Tensor* prior_box, - const framework::Tensor* prior_box_var, + void EncodeCenterSize(const framework::Tensor *target_box, + const framework::Tensor *prior_box, + const framework::Tensor *prior_box_var, const bool normalized, - const std::vector variance, T* output) const { + const std::vector variance, T *output) const { int64_t row = target_box->dims()[0]; int64_t col = prior_box->dims()[0]; int64_t len = prior_box->dims()[1]; - auto* 
target_box_data = target_box->data(); - auto* prior_box_data = prior_box->data(); - const T* prior_box_var_data = nullptr; - if (prior_box_var) prior_box_var_data = prior_box_var->data(); #ifdef PADDLE_WITH_MKLML #pragma omp parallel for collapse(2) #endif for (int64_t i = 0; i < row; ++i) { for (int64_t j = 0; j < col; ++j) { + auto *target_box_data = target_box->data(); + auto *prior_box_data = prior_box->data(); + size_t offset = i * col * len + j * len; T prior_box_width = prior_box_data[j * len + 2] - prior_box_data[j * len] + (normalized == false); T prior_box_height = prior_box_data[j * len + 3] - @@ -69,7 +68,6 @@ class BoxCoderKernel : public framework::OpKernel { target_box_data[i * len + 1] + (normalized == false); - size_t offset = i * col * len + j * len; output[offset] = (target_box_center_x - prior_box_center_x) / prior_box_width; output[offset + 1] = @@ -78,44 +76,61 @@ class BoxCoderKernel : public framework::OpKernel { std::log(std::fabs(target_box_width / prior_box_width)); output[offset + 3] = std::log(std::fabs(target_box_height / prior_box_height)); - if (prior_box_var) { - int prior_var_offset = j * len; - output[offset] /= prior_box_var_data[prior_var_offset]; - output[offset + 1] /= prior_box_var_data[prior_var_offset + 1]; - output[offset + 2] /= prior_box_var_data[prior_var_offset + 2]; - output[offset + 3] /= prior_box_var_data[prior_var_offset + 3]; - } else if (!(variance.empty())) { + } + } + + if (prior_box_var) { + const T *prior_box_var_data = prior_box_var->data(); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for collapse(3) +#endif + for (int64_t i = 0; i < row; ++i) { + for (int64_t j = 0; j < col; ++j) { for (int k = 0; k < 4; ++k) { + size_t offset = i * col * len + j * len; + int prior_var_offset = j * len; + output[offset + k] /= prior_box_var_data[prior_var_offset + k]; + } + } + } + } else if (!(variance.empty())) { +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for collapse(3) +#endif + for (int64_t i = 0; i < row; ++i) { + for (int64_t j = 0; j < col; ++j) { + for (int k = 0; k < 4; ++k) { + size_t offset = i * col * len + j * len; output[offset + k] /= static_cast(variance[k]); } } } } } + template - void DecodeCenterSize(const framework::Tensor* target_box, - const framework::Tensor* prior_box, - const framework::Tensor* prior_box_var, + void DecodeCenterSize(const framework::Tensor *target_box, + const framework::Tensor *prior_box, + const framework::Tensor *prior_box_var, const bool normalized, std::vector variance, - T* output) const { + T *output) const { int64_t row = target_box->dims()[0]; int64_t col = target_box->dims()[1]; int64_t len = target_box->dims()[2]; - auto* target_box_data = target_box->data(); - auto* prior_box_data = prior_box->data(); - const T* prior_box_var_data = nullptr; - if (var_size == 2) prior_box_var_data = prior_box_var->data(); - int prior_box_offset = 0; - T var_data[4] = {1., 1., 1., 1.}; - T* var_ptr = var_data; #ifdef PADDLE_WITH_MKLML #pragma omp parallel for collapse(2) #endif for (int64_t i = 0; i < row; ++i) { for (int64_t j = 0; j < col; ++j) { + auto *target_box_data = target_box->data(); + auto *prior_box_data = prior_box->data(); + + T var_data[4] = {1., 1., 1., 1.}; + T *var_ptr = var_data; size_t offset = i * col * len + j * len; - prior_box_offset = axis == 0 ? j * len : i * len; + int prior_box_offset = axis == 0 ? 
j * len : i * len; + T prior_box_width = prior_box_data[prior_box_offset + 2] - prior_box_data[prior_box_offset] + (normalized == false); @@ -131,10 +146,10 @@ class BoxCoderKernel : public framework::OpKernel { T target_box_width = 0, target_box_height = 0; int prior_var_offset = axis == 0 ? j * len : i * len; if (var_size == 2) { - std::memcpy(var_ptr, prior_box_var_data + prior_var_offset, + std::memcpy(var_ptr, prior_box_var->data() + prior_var_offset, 4 * sizeof(T)); } else if (var_size == 1) { - var_ptr = reinterpret_cast(variance.data()); + var_ptr = reinterpret_cast(variance.data()); } T box_var_x = *var_ptr; T box_var_y = *(var_ptr + 1); @@ -162,11 +177,11 @@ class BoxCoderKernel : public framework::OpKernel { } } - void Compute(const framework::ExecutionContext& context) const override { - auto* prior_box = context.Input("PriorBox"); - auto* prior_box_var = context.Input("PriorBoxVar"); - auto* target_box = context.Input("TargetBox"); - auto* output_box = context.Output("OutputBox"); + void Compute(const framework::ExecutionContext &context) const override { + auto *prior_box = context.Input("PriorBox"); + auto *prior_box_var = context.Input("PriorBoxVar"); + auto *target_box = context.Input("TargetBox"); + auto *output_box = context.Output("OutputBox"); std::vector variance = context.Attr>("variance"); const int axis = context.Attr("axis"); if (target_box->lod().size()) { @@ -194,7 +209,7 @@ class BoxCoderKernel : public framework::OpKernel { output_box->mutable_data({row, col, len}, context.GetPlace()); - T* output = output_box->data(); + T *output = output_box->data(); if (code_type == BoxCodeType::kEncodeCenterSize) { EncodeCenterSize(target_box, prior_box, prior_box_var, normalized, variance, output); diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..6d36876efd747d9e6f90c0d0200a9e9610a5318c --- /dev/null +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc @@ -0,0 +1,93 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h" + +namespace paddle { +namespace operators { + +class DistributeFpnProposalsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("FpnRois"), + "Input(FpnRois) shouldn't be null"); + PADDLE_ENFORCE_GE( + ctx->Outputs("MultiFpnRois").size(), 1UL, + "Outputs(MultiFpnRois) of DistributeOp should not be empty"); + size_t min_level = static_cast(ctx->Attrs().Get("min_level")); + size_t max_level = static_cast(ctx->Attrs().Get("max_level")); + PADDLE_ENFORCE_GE(max_level, min_level, + "max_level must not lower than min_level"); + // Set the output shape + size_t num_out_rois = max_level - min_level + 1; + std::vector outs_dims; + outs_dims.reserve(num_out_rois); + for (size_t i = 0; i < num_out_rois; ++i) { + framework::DDim out_dim = {-1, 4}; + outs_dims.push_back(out_dim); + } + ctx->SetOutputsDim("MultiFpnRois", outs_dims); + ctx->SetOutputDim("RestoreIndex", {1, -1}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("FpnRois")); + return framework::OpKernelType(data_type, platform::CPUPlace()); + } +}; + +class DistributeFpnProposalsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("FpnRois", "(LoDTensor) The rois at all levels in shape (-1, 4)"); + AddOutput("MultiFpnRois", "(LoDTensor) Output with distribute operator") + .AsDuplicable(); + AddOutput("RestoreIndex", + "(Tensor) An array of positive number which is " + "used to restore the order of FpnRois"); + AddAttr("min_level", + "The lowest level of FPN layer where the" + " proposals come from"); + AddAttr("max_level", + "The highest level of FPN layer where the" + " proposals come from"); + AddAttr("refer_level", + "The referring level of FPN layer with" + " specified scale"); + AddAttr("refer_scale", + "The referring scale of FPN layer with" + " specified level"); + AddComment(R"DOC( +This operator distribute all proposals into different fpn level, + with respect to scale of the proposals, the referring scale and + the referring level. Besides, to restore the order of proposals, +we return an array which indicate the original index of rois in + current proposals. +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(distribute_fpn_proposals, ops::DistributeFpnProposalsOp, + ops::DistributeFpnProposalsOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(distribute_fpn_proposals, + ops::DistributeFpnProposalsOpKernel, + ops::DistributeFpnProposalsOpKernel); diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..9cbb969158386547485fad54120510595eb92804 --- /dev/null +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu @@ -0,0 +1,221 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
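[Reviewer note] The CPU and GPU kernels of distribute_fpn_proposals assign each RoI to an FPN level with the rule tgt_lvl = floor(log2(sqrt(roi_area) / refer_scale) + refer_level), clamped to [min_level, max_level]. A minimal standalone sketch of that rule, with all names ours rather than the operator's:

#include <algorithm>
#include <cmath>

// Sketch only: the level-assignment rule used by the kernels in this op.
int TargetFpnLevel(float roi_area, int refer_level, int refer_scale,
                   int min_level, int max_level) {
  float roi_scale = std::sqrt(roi_area);  // side of an equal-area square
  int tgt_lvl = static_cast<int>(
      std::floor(std::log2(roi_scale / refer_scale) + refer_level));
  return std::min(max_level, std::max(tgt_lvl, min_level));
}

// With refer_level = 4 and refer_scale = 224, a 112x112 RoI maps to level 3
// and a 448x448 RoI maps to level 5.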
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "cub/cub.cuh" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h" +#include "paddle/fluid/operators/gather.cu.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; + +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +int const BBoxSize = 4; + +struct RangeInitFunctor { + int start_; + int delta_; + int* out_; + __device__ void operator()(size_t i) { out_[i] = start_ + i * delta_; } +}; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +static inline void TransLoD(const int* length_lod, const int lod_size, + int* offset_lod) { + int offset = 0; + for (int i = 0; i < lod_size; ++i) { + offset_lod[i] = offset; + offset += length_lod[i]; + } +} + +template +static __device__ inline T RoIArea(const T* box, bool normalized) { + if (box[2] < box[0] || box[3] < box[1]) { + // If coordinate values are is invalid + // (e.g. xmax < xmin or ymax < ymin), return 0. + return static_cast(0.); + } else { + const T w = box[2] - box[0]; + const T h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + // If coordinate values are not within range [0, 1]. 
+ return (w + 1) * (h + 1); + } + } +} + +template +static __global__ void GPUDistFpnProposalsHelper( + const int nthreads, const T* rois, const int lod_size, + const int refer_level, const int refer_scale, const int max_level, + const int min_level, int* roi_batch_id_data, int* sub_lod_list, + int* target_lvls) { + CUDA_1D_KERNEL_LOOP(i, nthreads) { + const T* offset_roi = rois + i * BBoxSize; + int roi_batch_ind = roi_batch_id_data[i]; + // get the target level of current rois + T roi_area = RoIArea(offset_roi, false); + T roi_scale = sqrt(roi_area); + int tgt_lvl = floor(log2(roi_scale / refer_scale) + refer_level); + tgt_lvl = min(max_level, max(tgt_lvl, min_level)); + target_lvls[i] = tgt_lvl; + // compute number of rois in the same batch and same target level + platform::CudaAtomicAdd(sub_lod_list + tgt_lvl * lod_size + roi_batch_ind, + 1); + } +} + +template +class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* fpn_rois = ctx.Input("FpnRois"); + + auto multi_fpn_rois = ctx.MultiOutput("MultiFpnRois"); + auto* restore_index = ctx.Output("RestoreIndex"); + + const int min_level = ctx.Attr("min_level"); + const int max_level = ctx.Attr("max_level"); + const int refer_level = ctx.Attr("refer_level"); + const int refer_scale = ctx.Attr("refer_scale"); + int num_level = max_level - min_level + 1; + + // check that the fpn_rois is not empty + PADDLE_ENFORCE_EQ(fpn_rois->lod().size(), 1UL, + "DistributeFpnProposalsOp need 1 level of LoD"); + + auto fpn_rois_lod = fpn_rois->lod().back(); + int lod_size = fpn_rois_lod.size() - 1; + int roi_num = fpn_rois_lod[lod_size]; + + auto& dev_ctx = ctx.template device_context(); + + // get batch id by lod in CPU + Tensor roi_batch_id_list; + roi_batch_id_list.Resize({roi_num}); + int* roi_batch_id_data = + roi_batch_id_list.mutable_data(platform::CPUPlace()); + for (int n = 0; n < lod_size; ++n) { + for (size_t i = fpn_rois_lod[n]; i < fpn_rois_lod[n + 1]; ++i) { + roi_batch_id_data[i] = n; + } + } + // copy batch id list to GPU + Tensor roi_batch_id_list_gpu; + framework::TensorCopySync(roi_batch_id_list, dev_ctx.GetPlace(), + &roi_batch_id_list_gpu); + + Tensor sub_lod_list; + sub_lod_list.Resize({num_level, lod_size}); + int* sub_lod_list_data = sub_lod_list.mutable_data(dev_ctx.GetPlace()); + Tensor target_lvls; + target_lvls.Resize({roi_num}); + int* target_lvls_data = target_lvls.mutable_data(dev_ctx.GetPlace()); + + int blocks = NumBlocks(roi_num); + int threads = kNumCUDAThreads; + + // get target levels and sub_lod list + GPUDistFpnProposalsHelper<<>>( + roi_num, fpn_rois->data(), lod_size, refer_level, refer_scale, + max_level, min_level, roi_batch_id_list_gpu.data(), + sub_lod_list_data, target_lvls_data); + + Tensor index_in_t; + int* idx_in = index_in_t.mutable_data({roi_num}, dev_ctx.GetPlace()); + platform::ForRange for_range(dev_ctx, roi_num); + for_range(RangeInitFunctor{0, 1, idx_in}); + + Tensor keys_out_t; + int* keys_out = keys_out_t.mutable_data({roi_num}, dev_ctx.GetPlace()); + Tensor index_out_t; + int* idx_out = index_out_t.mutable_data({roi_num}, dev_ctx.GetPlace()); + + // Determine temporary device storage requirements + size_t temp_storage_bytes = 0; + cub::DeviceRadixSort::SortPairsDescending( + nullptr, temp_storage_bytes, target_lvls_data, keys_out, idx_in, + idx_out, roi_num); + // Allocate temporary storage + auto place = boost::get(dev_ctx.GetPlace()); + auto d_temp_storage = memory::Alloc(place, 
temp_storage_bytes, + memory::Allocator::kScratchpad); + + // Run sorting operation + // sort target level to get corresponding index + cub::DeviceRadixSort::SortPairsDescending( + d_temp_storage->ptr(), temp_storage_bytes, target_lvls_data, keys_out, + idx_in, idx_out, roi_num); + + int* restore_idx_data = + restore_index->mutable_data({roi_num, 1}, dev_ctx.GetPlace()); + // sort current index to get restore index + cub::DeviceRadixSort::SortPairsDescending( + d_temp_storage->ptr(), temp_storage_bytes, idx_out, keys_out, idx_in, + restore_idx_data, roi_num); + + Tensor offset_lod; + int* offset_lod_data = + offset_lod.mutable_data({lod_size + 1}, dev_ctx.GetPlace()); + for (int i = 0; i < num_level; ++i) { + Tensor sub_lod = sub_lod_list.Slice(i, i + 1); + int* sub_lod_data = sub_lod.data(); + // transfer length-based lod to offset-based lod + TransLoD(sub_lod_data, lod_size + 1, offset_lod_data); + int sub_rois_num = offset_lod_data[lod_size]; + Tensor sub_idx = index_out_t.Slice(0, sub_rois_num); + + multi_fpn_rois[i]->mutable_data({sub_rois_num, kBoxDim}, + dev_ctx.GetPlace()); + + GPUGather(dev_ctx, *fpn_rois, sub_idx, multi_fpn_rois[i]); + framework::LoD lod; + std::vector offset; + memory::Copy(platform::CPUPlace(), offset.data(), place, offset_lod_data, + sizeof(int) * (lod_size + 1), 0); + lod.emplace_back(offset); + multi_fpn_rois[i]->set_lod(lod); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + distribute_fpn_proposals, + ops::GPUDistributeFpnProposalsOpKernel, + ops::GPUDistributeFpnProposalsOpKernel); diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f63e856626d64ec13476c3f967a085624a007c3a --- /dev/null +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h @@ -0,0 +1,147 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/safe_ref.h" +#include "paddle/fluid/operators/gather.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +const int kBoxDim = 4; + +template +static inline T BBoxArea(const T* box, bool normalized) { + if (box[2] < box[0] || box[3] < box[1]) { + // If coordinate values are is invalid + // (e.g. xmax < xmin or ymax < ymin), return 0. + return static_cast(0.); + } else { + const T w = box[2] - box[0]; + const T h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + // If coordinate values are not within range [0, 1]. 
+ return (w + 1) * (h + 1); + } + } +} + +template +class DistributeFpnProposalsOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* fpn_rois = context.Input("FpnRois"); + + auto multi_fpn_rois = + context.MultiOutput("MultiFpnRois"); + + auto* restore_index = + context.Output("RestoreIndex"); + + const int min_level = context.Attr("min_level"); + const int max_level = context.Attr("max_level"); + const int refer_level = context.Attr("refer_level"); + const int refer_scale = context.Attr("refer_scale"); + const int num_level = max_level - min_level + 1; + + // check that the fpn_rois is not empty + PADDLE_ENFORCE_EQ(fpn_rois->lod().size(), 1UL, + "DistributeFpnProposalsOp need 1 level of LoD"); + + auto fpn_rois_lod = fpn_rois->lod().back(); + int fpn_rois_num = fpn_rois_lod[fpn_rois_lod.size() - 1]; + std::vector target_level; + // std::vector target_level(fpn_rois_num, -1); + // record the number of rois in each level + std::vector num_rois_level(num_level, 0); + std::vector num_rois_level_integral(num_level + 1, 0); + for (int i = 0; i < fpn_rois_lod.size() - 1; ++i) { + Tensor fpn_rois_slice = + fpn_rois->Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]); + const T* rois_data = fpn_rois_slice.data(); + for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) { + // get the target level of current rois + T roi_scale = std::sqrt(BBoxArea(rois_data, false)); + int tgt_lvl = + std::floor(std::log2(roi_scale / refer_scale) + refer_level); + tgt_lvl = std::min(max_level, std::max(tgt_lvl, min_level)); + target_level.push_back(tgt_lvl); + num_rois_level[tgt_lvl - min_level]++; + rois_data += kBoxDim; + } + } + // define the output rois + // pointer which point to each level fpn rois + std::vector multi_fpn_rois_data(num_level); + // lod0 which will record the offset information of each level rois + std::vector> multi_fpn_rois_lod0; + for (int i = 0; i < num_level; ++i) { + // allocate memory for each level rois + multi_fpn_rois[i]->mutable_data({num_rois_level[i], kBoxDim}, + context.GetPlace()); + multi_fpn_rois_data[i] = multi_fpn_rois[i]->data(); + std::vector lod0(1, 0); + multi_fpn_rois_lod0.push_back(lod0); + // statistic start point for each level rois + num_rois_level_integral[i + 1] = + num_rois_level_integral[i] + num_rois_level[i]; + } + restore_index->mutable_data({1, fpn_rois_num}, context.GetPlace()); + int* restore_index_data = restore_index->data(); + std::vector restore_index_inter(fpn_rois_num, -1); + // distribute the rois into different fpn level by target level + for (int i = 0; i < fpn_rois_lod.size() - 1; ++i) { + Tensor fpn_rois_slice = + fpn_rois->Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]); + const T* rois_data = fpn_rois_slice.data(); + size_t cur_offset = fpn_rois_lod[i]; + // std::vector lod_offset[num_level]; + for (int j = 0; j < num_level; j++) { + multi_fpn_rois_lod0[j].push_back(multi_fpn_rois_lod0[j][i]); + } + for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) { + int lvl = target_level[cur_offset + j]; + memcpy(multi_fpn_rois_data[lvl - min_level], rois_data, + kBoxDim * sizeof(T)); + multi_fpn_rois_data[lvl - min_level] += kBoxDim; + int index_in_shuffle = num_rois_level_integral[lvl - min_level] + + multi_fpn_rois_lod0[lvl - min_level][i + 1]; + restore_index_inter[index_in_shuffle] = cur_offset + j; + multi_fpn_rois_lod0[lvl - min_level][i + 1]++; + rois_data += kBoxDim; + } + } + for (int i = 0; i < fpn_rois_num; ++i) { + restore_index_data[restore_index_inter[i]] = i; + } + // merge 
lod information into LoDTensor + for (int i = 0; i < num_level; ++i) { + framework::LoD lod; + lod.emplace_back(multi_fpn_rois_lod0[i]); + multi_fpn_rois[i]->set_lod(lod); + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/yolo_box_op.cc b/paddle/fluid/operators/detection/yolo_box_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e0d7e25d944cf2321799da4c73de9f74d9fd287d --- /dev/null +++ b/paddle/fluid/operators/detection/yolo_box_op.cc @@ -0,0 +1,167 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/fluid/operators/detection/yolo_box_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class YoloBoxOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of YoloBoxOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("ImgSize"), + "Input(ImgSize) of YoloBoxOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Boxes"), + "Output(Boxes) of YoloBoxOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Scores"), + "Output(Scores) of YoloBoxOp should not be null."); + + auto dim_x = ctx->GetInputDim("X"); + auto dim_imgsize = ctx->GetInputDim("ImgSize"); + auto anchors = ctx->Attrs().Get>("anchors"); + int anchor_num = anchors.size() / 2; + auto class_num = ctx->Attrs().Get("class_num"); + + PADDLE_ENFORCE_EQ(dim_x.size(), 4, "Input(X) should be a 4-D tensor."); + PADDLE_ENFORCE_EQ( + dim_x[1], anchor_num * (5 + class_num), + "Input(X) dim[1] should be equal to (anchor_mask_number * (5 " + "+ class_num))."); + PADDLE_ENFORCE_EQ(dim_imgsize.size(), 2, + "Input(ImgSize) should be a 2-D tensor."); + PADDLE_ENFORCE_EQ( + dim_imgsize[0], dim_x[0], + "Input(ImgSize) dim[0] and Input(X) dim[0] should be same."); + PADDLE_ENFORCE_EQ(dim_imgsize[1], 2, "Input(ImgSize) dim[1] should be 2."); + PADDLE_ENFORCE_GT(anchors.size(), 0, + "Attr(anchors) length should be greater than 0."); + PADDLE_ENFORCE_EQ(anchors.size() % 2, 0, + "Attr(anchors) length should be even integer."); + PADDLE_ENFORCE_GT(class_num, 0, + "Attr(class_num) should be an integer greater than 0."); + + int box_num = dim_x[2] * dim_x[3] * anchor_num; + std::vector dim_boxes({dim_x[0], box_num, 4}); + ctx->SetOutputDim("Boxes", framework::make_ddim(dim_boxes)); + + std::vector dim_scores({dim_x[0], box_num, class_num}); + ctx->SetOutputDim("Scores", framework::make_ddim(dim_scores)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); + } +}; + +class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "The input tensor of 
YoloBox operator is a 4-D tensor with " + "shape of [N, C, H, W]. The second dimension(C) stores " + "box locations, confidence score and classification one-hot " + "keys of each anchor box. Generally, X should be the output " + "of YOLOv3 network."); + AddInput("ImgSize", + "The image size tensor of YoloBox operator, " + "This is a 2-D tensor with shape of [N, 2]. This tensor holds " + "height and width of each input image used for resizing output " + "box in input image scale."); + AddOutput("Boxes", + "The output tensor of detection boxes of YoloBox operator, " + "This is a 3-D tensor with shape of [N, M, 4], N is the " + "batch num, M is output box number, and the 3rd dimension " + "stores [xmin, ymin, xmax, ymax] coordinates of boxes."); + AddOutput("Scores", + "The output tensor of detection boxes scores of YoloBox " + "operator, This is a 3-D tensor with shape of " + "[N, M, :attr:`class_num`], N is the batch num, M is " + "output box number."); + + AddAttr("class_num", "The number of classes to predict."); + AddAttr>("anchors", + "The anchor width and height, " + "it will be parsed pair by pair.") + .SetDefault(std::vector{}); + AddAttr("downsample_ratio", + "The downsample ratio from network input to YoloBox operator " + "input, so 32, 16, 8 should be set for the first, second, " + "and thrid YoloBox operators.") + .SetDefault(32); + AddAttr("conf_thresh", + "The confidence scores threshold of detection boxes. " + "Boxes with confidence scores under threshold should " + "be ignored.") + .SetDefault(0.01); + AddComment(R"DOC( + This operator generates YOLO detection boxes from output of YOLOv3 network. + + The output of previous network is in shape [N, C, H, W], while H and W + should be the same, H and W specify the grid size, each grid point predict + given number boxes, this given number, which following will be represented as S, + is specified by the number of anchors. In the second dimension(the channel + dimension), C should be equal to S * (5 + class_num), class_num is the object + category number of source dataset(such as 80 in coco dataset), so the + second(channel) dimension, apart from 4 box location coordinates x, y, w, h, + also includes confidence score of the box and class one-hot key of each anchor + box. + + Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`, the box + predictions should be as follows: + + $$ + b_x = \\sigma(t_x) + c_x + $$ + $$ + b_y = \\sigma(t_y) + c_y + $$ + $$ + b_w = p_w e^{t_w} + $$ + $$ + b_h = p_h e^{t_h} + $$ + + in the equation above, :math:`c_x, c_y` is the left top corner of current grid + and :math:`p_w, p_h` is specified by anchors. + + The logistic regression value of the 5th channel of each anchor prediction boxes + represents the confidence score of each prediction box, and the logistic + regression value of the last :attr:`class_num` channels of each anchor prediction + boxes represents the classifcation scores. Boxes with confidence scores less than + :attr:`conf_thresh` should be ignored, and box final scores is the product of + confidence scores and classification scores. 
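[Reviewer note] A minimal sketch of the box decoding described by the equations above; the function name and the normalized output convention are ours, not the operator's API:

#include <cmath>

float Sigmoid(float x) { return 1.f / (1.f + std::exp(-x)); }

// Sketch only: decode one prediction (t_x, t_y, t_w, t_h) at grid cell
// (c_x, c_y) with anchor (p_w, p_h) into a center-size box, normalized by
// the grid size (for x, y) and the network input size (for w, h).
void DecodeYoloBox(float tx, float ty, float tw, float th, int cx, int cy,
                   float pw, float ph, int grid_size, int input_size,
                   float box[4]) {
  box[0] = (cx + Sigmoid(tx)) / grid_size;  // b_x
  box[1] = (cy + Sigmoid(ty)) / grid_size;  // b_y
  box[2] = std::exp(tw) * pw / input_size;  // b_w
  box[3] = std::exp(th) * ph / input_size;  // b_h
}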
+ + $$ + score_{pred} = score_{conf} * score_{class} + $$ + + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(yolo_box, ops::YoloBoxOp, ops::YoloBoxOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(yolo_box, ops::YoloBoxKernel, + ops::YoloBoxKernel); diff --git a/paddle/fluid/operators/detection/yolo_box_op.cu b/paddle/fluid/operators/detection/yolo_box_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..5a882958e66a79507e053a96b15be8cbbcc83164 --- /dev/null +++ b/paddle/fluid/operators/detection/yolo_box_op.cu @@ -0,0 +1,120 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/yolo_box_op.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +__global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes, + T* scores, const float conf_thresh, + const int* anchors, const int n, const int h, + const int w, const int an_num, const int class_num, + const int box_num, int input_size) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + T box[4]; + for (; tid < n * box_num; tid += stride) { + int grid_num = h * w; + int i = tid / box_num; + int j = (tid % box_num) / grid_num; + int k = (tid % grid_num) / w; + int l = tid % w; + + int an_stride = (5 + class_num) * grid_num; + int img_height = imgsize[2 * i]; + int img_width = imgsize[2 * i + 1]; + + int obj_idx = + GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4); + T conf = sigmoid(input[obj_idx]); + if (conf < conf_thresh) { + continue; + } + + int box_idx = + GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0); + GetYoloBox(box, input, anchors, l, k, j, h, input_size, box_idx, + grid_num, img_height, img_width); + box_idx = (i * box_num + j * grid_num + k * w + l) * 4; + CalcDetectionBox(boxes, box, box_idx, img_height, img_width); + + int label_idx = + GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 5); + int score_idx = (i * box_num + j * grid_num + k * w + l) * class_num; + CalcLabelScore(scores, input, label_idx, score_idx, class_num, conf, + grid_num); + } +} + +template +class YoloBoxOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* img_size = ctx.Input("ImgSize"); + auto* boxes = ctx.Output("Boxes"); + auto* scores = ctx.Output("Scores"); + + auto anchors = ctx.Attr>("anchors"); + int class_num = ctx.Attr("class_num"); + float conf_thresh = ctx.Attr("conf_thresh"); + int downsample_ratio = ctx.Attr("downsample_ratio"); + + const int n = input->dims()[0]; + const int h = input->dims()[2]; + const int w = input->dims()[3]; + const int box_num = boxes->dims()[1]; + const int an_num = anchors.size() / 2; + int input_size = 
downsample_ratio * h; + + auto& dev_ctx = ctx.cuda_device_context(); + auto& allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + int bytes = sizeof(int) * anchors.size(); + auto anchors_ptr = allocator.Allocate(sizeof(int) * anchors.size()); + int* anchors_data = reinterpret_cast(anchors_ptr->ptr()); + const auto gplace = boost::get(ctx.GetPlace()); + const auto cplace = platform::CPUPlace(); + memory::Copy(gplace, anchors_data, cplace, anchors.data(), bytes, + dev_ctx.stream()); + + const T* input_data = input->data(); + const int* imgsize_data = img_size->data(); + T* boxes_data = boxes->mutable_data({n, box_num, 4}, ctx.GetPlace()); + T* scores_data = + scores->mutable_data({n, box_num, class_num}, ctx.GetPlace()); + math::SetConstant set_zero; + set_zero(dev_ctx, boxes, static_cast(0)); + set_zero(dev_ctx, scores, static_cast(0)); + + int grid_dim = (n * box_num + 512 - 1) / 512; + grid_dim = grid_dim > 8 ? 8 : grid_dim; + + KeYoloBoxFw<<>>( + input_data, imgsize_data, boxes_data, scores_data, conf_thresh, + anchors_data, n, h, w, an_num, class_num, box_num, input_size); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(yolo_box, ops::YoloBoxOpCUDAKernel, + ops::YoloBoxOpCUDAKernel); diff --git a/paddle/fluid/operators/detection/yolo_box_op.h b/paddle/fluid/operators/detection/yolo_box_op.h new file mode 100644 index 0000000000000000000000000000000000000000..8b7c7df0f3cf754f59c994dbe5b1cc2ac5fb773b --- /dev/null +++ b/paddle/fluid/operators/detection/yolo_box_op.h @@ -0,0 +1,149 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
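[Reviewer note] Both the CUDA kernel above and the CPU kernel in the header below address the [N, an_num * (5 + class_num), H, W] input through GetEntryIndex. A sketch of that flattened NCHW addressing, with the per-anchor channel layout spelled out (names ours):

// Sketch only: an_stride = (5 + class_num) * H * W, stride = H * W.
// Entries 0..3 hold x, y, w, h; entry 4 holds objectness; entries 5 and
// onward hold the class logits.
int EntryIndex(int batch, int an_idx, int hw_idx, int an_num, int an_stride,
               int stride, int entry) {
  return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx;
}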
*/ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +HOSTDEVICE inline T sigmoid(T x) { + return 1.0 / (1.0 + std::exp(-x)); +} + +template +HOSTDEVICE inline void GetYoloBox(T* box, const T* x, const int* anchors, int i, + int j, int an_idx, int grid_size, + int input_size, int index, int stride, + int img_height, int img_width) { + box[0] = (i + sigmoid(x[index])) * img_width / grid_size; + box[1] = (j + sigmoid(x[index + stride])) * img_height / grid_size; + box[2] = std::exp(x[index + 2 * stride]) * anchors[2 * an_idx] * img_width / + input_size; + box[3] = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1] * + img_height / input_size; +} + +HOSTDEVICE inline int GetEntryIndex(int batch, int an_idx, int hw_idx, + int an_num, int an_stride, int stride, + int entry) { + return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; +} + +template +HOSTDEVICE inline void CalcDetectionBox(T* boxes, T* box, const int box_idx, + const int img_height, + const int img_width) { + boxes[box_idx] = box[0] - box[2] / 2; + boxes[box_idx + 1] = box[1] - box[3] / 2; + boxes[box_idx + 2] = box[0] + box[2] / 2; + boxes[box_idx + 3] = box[1] + box[3] / 2; + + boxes[box_idx] = boxes[box_idx] > 0 ? boxes[box_idx] : static_cast(0); + boxes[box_idx + 1] = + boxes[box_idx + 1] > 0 ? boxes[box_idx + 1] : static_cast(0); + boxes[box_idx + 2] = boxes[box_idx + 2] < img_width - 1 + ? boxes[box_idx + 2] + : static_cast(img_width - 1); + boxes[box_idx + 3] = boxes[box_idx + 3] < img_height - 1 + ? boxes[box_idx + 3] + : static_cast(img_height - 1); +} + +template +HOSTDEVICE inline void CalcLabelScore(T* scores, const T* input, + const int label_idx, const int score_idx, + const int class_num, const T conf, + const int stride) { + for (int i = 0; i < class_num; i++) { + scores[score_idx + i] = conf * sigmoid(input[label_idx + i * stride]); + } +} + +template +class YoloBoxKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* imgsize = ctx.Input("ImgSize"); + auto* boxes = ctx.Output("Boxes"); + auto* scores = ctx.Output("Scores"); + auto anchors = ctx.Attr>("anchors"); + int class_num = ctx.Attr("class_num"); + float conf_thresh = ctx.Attr("conf_thresh"); + int downsample_ratio = ctx.Attr("downsample_ratio"); + + const int n = input->dims()[0]; + const int h = input->dims()[2]; + const int w = input->dims()[3]; + const int box_num = boxes->dims()[1]; + const int an_num = anchors.size() / 2; + int input_size = downsample_ratio * h; + + const int stride = h * w; + const int an_stride = (class_num + 5) * stride; + + Tensor anchors_; + auto anchors_data = + anchors_.mutable_data({an_num * 2}, ctx.GetPlace()); + std::copy(anchors.begin(), anchors.end(), anchors_data); + + const T* input_data = input->data(); + const int* imgsize_data = imgsize->data(); + T* boxes_data = boxes->mutable_data({n, box_num, 4}, ctx.GetPlace()); + memset(boxes_data, 0, boxes->numel() * sizeof(T)); + T* scores_data = + scores->mutable_data({n, box_num, class_num}, ctx.GetPlace()); + memset(scores_data, 0, scores->numel() * sizeof(T)); + + T box[4]; + for (int i = 0; i < n; i++) { + int img_height = imgsize_data[2 * i]; + int img_width = imgsize_data[2 * i + 1]; + + for (int j = 0; j < an_num; j++) { + for (int k = 0; k < h; k++) { + for (int l = 0; 
l < w; l++) { + int obj_idx = + GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 4); + T conf = sigmoid(input_data[obj_idx]); + if (conf < conf_thresh) { + continue; + } + + int box_idx = + GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 0); + GetYoloBox(box, input_data, anchors_data, l, k, j, h, input_size, + box_idx, stride, img_height, img_width); + box_idx = (i * box_num + j * stride + k * w + l) * 4; + CalcDetectionBox(boxes_data, box, box_idx, img_height, + img_width); + + int label_idx = + GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 5); + int score_idx = (i * box_num + j * stride + k * w + l) * class_num; + CalcLabelScore(scores_data, input_data, label_idx, score_idx, + class_num, conf, stride); + } + } + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.cc b/paddle/fluid/operators/detection/yolov3_loss_op.cc index ab01bdf7ca8c5a369bd8838b1acc734364666992..6c37da17f4011d38efcdc5406331f1be173dd0dd 100644 --- a/paddle/fluid/operators/detection/yolov3_loss_op.cc +++ b/paddle/fluid/operators/detection/yolov3_loss_op.cc @@ -10,6 +10,7 @@ limitations under the License. */ #include "paddle/fluid/operators/detection/yolov3_loss_op.h" +#include #include "paddle/fluid/framework/op_registry.h" namespace paddle { @@ -72,6 +73,18 @@ class Yolov3LossOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GT(class_num, 0, "Attr(class_num) should be an integer greater then 0."); + if (ctx->HasInput("GTScore")) { + auto dim_gtscore = ctx->GetInputDim("GTScore"); + PADDLE_ENFORCE_EQ(dim_gtscore.size(), 2, + "Input(GTScore) should be a 2-D tensor"); + PADDLE_ENFORCE_EQ( + dim_gtscore[0], dim_gtbox[0], + "Input(GTBox) and Input(GTScore) dim[0] should be same"); + PADDLE_ENFORCE_EQ( + dim_gtscore[1], dim_gtbox[1], + "Input(GTBox) and Input(GTScore) dim[1] should be same"); + } + std::vector dim_out({dim_x[0]}); ctx->SetOutputDim("Loss", framework::make_ddim(dim_out)); @@ -112,6 +125,12 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { "This is a 2-D tensor with shape of [N, max_box_num], " "and each element should be an integer to indicate the " "box class id."); + AddInput("GTScore", + "The score of GTLabel, This is a 2-D tensor in same shape " + "GTLabel, and score values should in range (0, 1). This " + "input is for GTLabel score can be not 1.0 in image mixup " + "augmentation.") + .AsDispensable(); AddOutput("Loss", "The output yolov3 loss tensor, " "This is a 1-D tensor with shape of [N]"); @@ -143,6 +162,9 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("ignore_thresh", "The ignore threshold to ignore confidence loss.") .SetDefault(0.7); + AddAttr("use_label_smooth", + "Whether to use label smooth. Default True.") + .SetDefault(true); AddComment(R"DOC( This operator generates yolov3 loss based on given predict result and ground truth boxes. @@ -204,6 +226,15 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { loss = (loss_{xy} + loss_{wh}) * weight_{box} + loss_{conf} + loss_{class} $$ + + While :attr:`use_label_smooth` is set to be :attr:`True`, the classification + target will be smoothed when calculating classification loss, target of + positive samples will be smoothed to :math:`1.0 - 1.0 / class\_num` and target of + negetive samples will be smoothed to :math:`1.0 / class\_num`. 
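[Reviewer note] A one-line illustration of the smoothed targets described above, assuming class_num = 80 (e.g. COCO); names ours:

#include <cstdio>

// Sketch only: classification targets when use_label_smooth is enabled.
int main() {
  const float class_num = 80.f;
  const float label_pos = 1.f - 1.f / class_num;  // positive target: 0.9875
  const float label_neg = 1.f / class_num;        // negative target: 0.0125
  std::printf("pos=%.4f neg=%.4f\n", label_pos, label_neg);
  return 0;
}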
+ + While :attr:`GTScore` is given, which means the mixup score of ground truth + boxes, all losses incured by a ground truth box will be multiplied by its + mixup score. )DOC"); } }; @@ -240,6 +271,7 @@ class Yolov3LossGradMaker : public framework::SingleGradOpDescMaker { op->SetInput("X", Input("X")); op->SetInput("GTBox", Input("GTBox")); op->SetInput("GTLabel", Input("GTLabel")); + op->SetInput("GTScore", Input("GTScore")); op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss")); op->SetInput("ObjectnessMask", Output("ObjectnessMask")); op->SetInput("GTMatchMask", Output("GTMatchMask")); @@ -249,6 +281,7 @@ class Yolov3LossGradMaker : public framework::SingleGradOpDescMaker { op->SetOutput(framework::GradVarName("X"), InputGrad("X")); op->SetOutput(framework::GradVarName("GTBox"), {}); op->SetOutput(framework::GradVarName("GTLabel"), {}); + op->SetOutput(framework::GradVarName("GTScore"), {}); return std::unique_ptr(op); } }; diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.h b/paddle/fluid/operators/detection/yolov3_loss_op.h index 8407d4e6e8f87a2e8d073c4fbda5691abe1bba68..a004b022b75174012d10ba38e5ec161830c62640 100644 --- a/paddle/fluid/operators/detection/yolov3_loss_op.h +++ b/paddle/fluid/operators/detection/yolov3_loss_op.h @@ -37,8 +37,8 @@ static T SigmoidCrossEntropy(T x, T label) { } template -static T L2Loss(T x, T y) { - return 0.5 * (y - x) * (y - x); +static T L1Loss(T x, T y) { + return std::abs(y - x); } template @@ -47,8 +47,8 @@ static T SigmoidCrossEntropyGrad(T x, T label) { } template -static T L2LossGrad(T x, T y) { - return x - y; +static T L1LossGrad(T x, T y) { + return x > y ? 1.0 : -1.0; } static int GetMaskIndex(std::vector mask, int val) { @@ -121,47 +121,49 @@ template static void CalcBoxLocationLoss(T* loss, const T* input, Box gt, std::vector anchors, int an_idx, int box_idx, int gi, int gj, int grid_size, - int input_size, int stride) { + int input_size, int stride, T score) { T tx = gt.x * grid_size - gi; T ty = gt.y * grid_size - gj; T tw = std::log(gt.w * input_size / anchors[2 * an_idx]); T th = std::log(gt.h * input_size / anchors[2 * an_idx + 1]); - T scale = (2.0 - gt.w * gt.h); + T scale = (2.0 - gt.w * gt.h) * score; loss[0] += SigmoidCrossEntropy(input[box_idx], tx) * scale; loss[0] += SigmoidCrossEntropy(input[box_idx + stride], ty) * scale; - loss[0] += L2Loss(input[box_idx + 2 * stride], tw) * scale; - loss[0] += L2Loss(input[box_idx + 3 * stride], th) * scale; + loss[0] += L1Loss(input[box_idx + 2 * stride], tw) * scale; + loss[0] += L1Loss(input[box_idx + 3 * stride], th) * scale; } template static void CalcBoxLocationLossGrad(T* input_grad, const T loss, const T* input, Box gt, std::vector anchors, int an_idx, int box_idx, int gi, int gj, - int grid_size, int input_size, int stride) { + int grid_size, int input_size, int stride, + T score) { T tx = gt.x * grid_size - gi; T ty = gt.y * grid_size - gj; T tw = std::log(gt.w * input_size / anchors[2 * an_idx]); T th = std::log(gt.h * input_size / anchors[2 * an_idx + 1]); - T scale = (2.0 - gt.w * gt.h); + T scale = (2.0 - gt.w * gt.h) * score; input_grad[box_idx] = SigmoidCrossEntropyGrad(input[box_idx], tx) * scale * loss; input_grad[box_idx + stride] = SigmoidCrossEntropyGrad(input[box_idx + stride], ty) * scale * loss; input_grad[box_idx + 2 * stride] = - L2LossGrad(input[box_idx + 2 * stride], tw) * scale * loss; + L1LossGrad(input[box_idx + 2 * stride], tw) * scale * loss; input_grad[box_idx + 3 * stride] = - L2LossGrad(input[box_idx + 3 * stride], th) * scale * 
loss; + L1LossGrad(input[box_idx + 3 * stride], th) * scale * loss; } template static inline void CalcLabelLoss(T* loss, const T* input, const int index, const int label, const int class_num, - const int stride) { + const int stride, const T pos, const T neg, + T score) { for (int i = 0; i < class_num; i++) { T pred = input[index + i * stride]; - loss[0] += SigmoidCrossEntropy(pred, (i == label) ? 1.0 : 0.0); + loss[0] += SigmoidCrossEntropy(pred, (i == label) ? pos : neg) * score; } } @@ -169,11 +171,13 @@ template static inline void CalcLabelLossGrad(T* input_grad, const T loss, const T* input, const int index, const int label, const int class_num, - const int stride) { + const int stride, const T pos, const T neg, + T score) { for (int i = 0; i < class_num; i++) { T pred = input[index + i * stride]; input_grad[index + i * stride] = - SigmoidCrossEntropyGrad(pred, (i == label) ? 1.0 : 0.0) * loss; + SigmoidCrossEntropyGrad(pred, (i == label) ? pos : neg) * score * + loss; } } @@ -188,8 +192,8 @@ static inline void CalcObjnessLoss(T* loss, const T* input, const T* objness, for (int l = 0; l < w; l++) { T obj = objness[k * w + l]; if (obj > 1e-5) { - // positive sample: obj = 1 - loss[i] += SigmoidCrossEntropy(input[k * w + l], 1.0); + // positive sample: obj = mixup score + loss[i] += SigmoidCrossEntropy(input[k * w + l], 1.0) * obj; } else if (obj > -0.5) { // negetive sample: obj = 0 loss[i] += SigmoidCrossEntropy(input[k * w + l], 0.0); @@ -215,7 +219,8 @@ static inline void CalcObjnessLossGrad(T* input_grad, const T* loss, T obj = objness[k * w + l]; if (obj > 1e-5) { input_grad[k * w + l] = - SigmoidCrossEntropyGrad(input[k * w + l], 1.0) * loss[i]; + SigmoidCrossEntropyGrad(input[k * w + l], 1.0) * obj * + loss[i]; } else if (obj > -0.5) { input_grad[k * w + l] = SigmoidCrossEntropyGrad(input[k * w + l], 0.0) * loss[i]; @@ -252,6 +257,7 @@ class Yolov3LossKernel : public framework::OpKernel { auto* input = ctx.Input("X"); auto* gt_box = ctx.Input("GTBox"); auto* gt_label = ctx.Input("GTLabel"); + auto* gt_score = ctx.Input("GTScore"); auto* loss = ctx.Output("Loss"); auto* objness_mask = ctx.Output("ObjectnessMask"); auto* gt_match_mask = ctx.Output("GTMatchMask"); @@ -260,6 +266,7 @@ class Yolov3LossKernel : public framework::OpKernel { int class_num = ctx.Attr("class_num"); float ignore_thresh = ctx.Attr("ignore_thresh"); int downsample_ratio = ctx.Attr("downsample_ratio"); + bool use_label_smooth = ctx.Attr("use_label_smooth"); const int n = input->dims()[0]; const int h = input->dims()[2]; @@ -272,6 +279,13 @@ class Yolov3LossKernel : public framework::OpKernel { const int stride = h * w; const int an_stride = (class_num + 5) * stride; + T label_pos = 1.0; + T label_neg = 0.0; + if (use_label_smooth) { + label_pos = 1.0 - 1.0 / static_cast(class_num); + label_neg = 1.0 / static_cast(class_num); + } + const T* input_data = input->data(); const T* gt_box_data = gt_box->data(); const int* gt_label_data = gt_label->data(); @@ -283,6 +297,19 @@ class Yolov3LossKernel : public framework::OpKernel { int* gt_match_mask_data = gt_match_mask->mutable_data({n, b}, ctx.GetPlace()); + const T* gt_score_data; + if (!gt_score) { + Tensor gtscore; + gtscore.mutable_data({n, b}, ctx.GetPlace()); + math::SetConstant()( + ctx.template device_context(), >score, + static_cast(1.0)); + gt_score = >score; + gt_score_data = gtscore.data(); + } else { + gt_score_data = gt_score->data(); + } + // calc valid gt box mask, avoid calc duplicately in following code Tensor gt_valid_mask; bool* 
gt_valid_mask_data = @@ -355,19 +382,20 @@ class Yolov3LossKernel : public framework::OpKernel { int mask_idx = GetMaskIndex(anchor_mask, best_n); gt_match_mask_data[i * b + t] = mask_idx; if (mask_idx >= 0) { + T score = gt_score_data[i * b + t]; int box_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num, an_stride, stride, 0); CalcBoxLocationLoss(loss_data + i, input_data, gt, anchors, best_n, - box_idx, gi, gj, h, input_size, stride); + box_idx, gi, gj, h, input_size, stride, score); int obj_idx = (i * mask_num + mask_idx) * stride + gj * w + gi; - obj_mask_data[obj_idx] = 1.0; + obj_mask_data[obj_idx] = score; int label = gt_label_data[i * b + t]; int label_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num, an_stride, stride, 5); CalcLabelLoss(loss_data + i, input_data, label_idx, label, - class_num, stride); + class_num, stride, label_pos, label_neg, score); } } } @@ -384,6 +412,7 @@ class Yolov3LossGradKernel : public framework::OpKernel { auto* input = ctx.Input("X"); auto* gt_box = ctx.Input("GTBox"); auto* gt_label = ctx.Input("GTLabel"); + auto* gt_score = ctx.Input("GTScore"); auto* input_grad = ctx.Output(framework::GradVarName("X")); auto* loss_grad = ctx.Input(framework::GradVarName("Loss")); auto* objness_mask = ctx.Input("ObjectnessMask"); @@ -392,6 +421,7 @@ class Yolov3LossGradKernel : public framework::OpKernel { auto anchor_mask = ctx.Attr>("anchor_mask"); int class_num = ctx.Attr("class_num"); int downsample_ratio = ctx.Attr("downsample_ratio"); + bool use_label_smooth = ctx.Attr("use_label_smooth"); const int n = input_grad->dims()[0]; const int c = input_grad->dims()[1]; @@ -404,6 +434,13 @@ class Yolov3LossGradKernel : public framework::OpKernel { const int stride = h * w; const int an_stride = (class_num + 5) * stride; + T label_pos = 1.0; + T label_neg = 0.0; + if (use_label_smooth) { + label_pos = 1.0 - 1.0 / static_cast(class_num); + label_neg = 1.0 / static_cast(class_num); + } + const T* input_data = input->data(); const T* gt_box_data = gt_box->data(); const int* gt_label_data = gt_label->data(); @@ -414,25 +451,41 @@ class Yolov3LossGradKernel : public framework::OpKernel { input_grad->mutable_data({n, c, h, w}, ctx.GetPlace()); memset(input_grad_data, 0, input_grad->numel() * sizeof(T)); + const T* gt_score_data; + if (!gt_score) { + Tensor gtscore; + gtscore.mutable_data({n, b}, ctx.GetPlace()); + math::SetConstant()( + ctx.template device_context(), >score, + static_cast(1.0)); + gt_score = >score; + gt_score_data = gtscore.data(); + } else { + gt_score_data = gt_score->data(); + } + for (int i = 0; i < n; i++) { for (int t = 0; t < b; t++) { int mask_idx = gt_match_mask_data[i * b + t]; if (mask_idx >= 0) { + T score = gt_score_data[i * b + t]; Box gt = GetGtBox(gt_box_data, i, b, t); int gi = static_cast(gt.x * w); int gj = static_cast(gt.y * h); int box_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num, an_stride, stride, 0); - CalcBoxLocationLossGrad( - input_grad_data, loss_grad_data[i], input_data, gt, anchors, - anchor_mask[mask_idx], box_idx, gi, gj, h, input_size, stride); + CalcBoxLocationLossGrad(input_grad_data, loss_grad_data[i], + input_data, gt, anchors, + anchor_mask[mask_idx], box_idx, gi, gj, h, + input_size, stride, score); int label = gt_label_data[i * b + t]; int label_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num, an_stride, stride, 5); CalcLabelLossGrad(input_grad_data, loss_grad_data[i], input_data, - label_idx, label, class_num, stride); + label_idx, label, class_num, stride, label_pos, + label_neg, score); 
 }
 }
 }
diff --git a/paddle/fluid/operators/dgc_clip_by_norm_op.cc b/paddle/fluid/operators/dgc_clip_by_norm_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6ebad4de3c8ebc57823709c04498a1f4311942a5
--- /dev/null
+++ b/paddle/fluid/operators/dgc_clip_by_norm_op.cc
@@ -0,0 +1,67 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include
+
+#include "paddle/fluid/operators/dgc_clip_by_norm_op.h"
+
+namespace paddle {
+namespace operators {
+
+class DGCClipByNormOp : public ClipByNormOp {
+ public:
+  using ClipByNormOp::ClipByNormOp;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("current_step"),
+                   "current_step should be set.");
+
+    return ClipByNormOp::InferShape(ctx);
+  }
+
+  framework::OpKernelType GetKernelTypeForVar(
+      const std::string& var_name, const framework::Tensor& tensor,
+      const framework::OpKernelType& expected_kernel_type) const override {
+    if (var_name == "current_step") {
+      VLOG(10) << "var_name:" << var_name << " need not to transform";
+      return expected_kernel_type;
+    }
+
+    return framework::OperatorWithKernel::GetKernelTypeForVar(
+        var_name, tensor, expected_kernel_type);
+  }
+};
+
+class DGCClipByNormOpMaker : public ClipByNormOpMaker {
+ public:
+  void Make() override {
+    AddInput("current_step", "(Tensor) Current step.");
+    AddAttr<float>("rampup_begin_step",
+                   "(float, -1.0) "
+                   "The step at which k_select begins.")
+        .SetDefault(-1.0);
+
+    return ClipByNormOpMaker::Make();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(dgc_clip_by_norm, ops::DGCClipByNormOp,
+                             ops::DGCClipByNormOpMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    dgc_clip_by_norm,
+    ops::DGCClipByNormKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/dgc_clip_by_norm_op.cu b/paddle/fluid/operators/dgc_clip_by_norm_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e7f564b7ab4d1c11810dc096faec7f5a375b8563
--- /dev/null
+++ b/paddle/fluid/operators/dgc_clip_by_norm_op.cu
@@ -0,0 +1,20 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
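[Reviewer note] dgc_clip_by_norm reuses the regular clip_by_norm computation and only adds a step gate (see the header below). For reference, clip-by-norm rescales a tensor so its L2 norm does not exceed max_norm; a minimal sketch under that understanding, names ours:

#include <cmath>
#include <vector>

// Sketch only: scale t so that ||t||_2 <= max_norm, as clip_by_norm does.
void ClipByNorm(std::vector<float>* t, float max_norm) {
  float sq = 0.f;
  for (float v : *t) sq += v * v;
  const float norm = std::sqrt(sq);
  if (norm > max_norm) {
    const float scale = max_norm / norm;
    for (float& v : *t) v *= scale;
  }
}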
*/ + +#include "paddle/fluid/operators/dgc_clip_by_norm_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + dgc_clip_by_norm, + ops::DGCClipByNormKernel); diff --git a/paddle/fluid/operators/dgc_clip_by_norm_op.h b/paddle/fluid/operators/dgc_clip_by_norm_op.h new file mode 100644 index 0000000000000000000000000000000000000000..bd22d16f7a21877af4e78c30f7e0985c64b543f2 --- /dev/null +++ b/paddle/fluid/operators/dgc_clip_by_norm_op.h @@ -0,0 +1,46 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/operators/clip_by_norm_op.h" + +namespace paddle { +namespace operators { + +template +class DGCClipByNormKernel : public ClipByNormKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto rampup_begin_step = context.Attr("rampup_begin_step"); + if (static_cast(rampup_begin_step) >= 0) { + auto current_step_tensor = + context.Input("current_step"); + auto* current_step = current_step_tensor->data(); + + if (static_cast(*current_step) < + static_cast(rampup_begin_step)) { + VLOG(10) << "current_step:" << *current_step + << " < rampup_begin_step:" << rampup_begin_step + << " so does't use dgc_clip_by_norm"; + return; + } + } + + return ClipByNormKernel::Compute(context); + }; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/dgc_op.cc b/paddle/fluid/operators/dgc_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ccdeea2d0a96342a57ca56ae2b686f81b32fd866 --- /dev/null +++ b/paddle/fluid/operators/dgc_op.cc @@ -0,0 +1,138 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/dgc_op.h" +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class DGCOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("U"), "Input(U) of DGCop should not be null."); + PADDLE_ENFORCE(ctx->HasInput("V"), "Input(V) of DGCop should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of DGCop should not be null."); + PADDLE_ENFORCE(ctx->HasInput("current_step"), + "Input(current_step) of DGCop should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("U_out"), + "Output(U_out) of DGCop should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("V_out"), + "Output(V_out) of DGCop should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("k"), + "Output(k) of DGCop should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("EncodeGrad"), + "Output(EncodeGrad) of DGCop should not be null."); + } + + protected: + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + if (var_name == "current_step" || var_name == "rampup_step" || + var_name == "k") { + VLOG(10) << "var_name:" << var_name << " need not to transform"; + return expected_kernel_type; + } + + return framework::OperatorWithKernel::GetKernelTypeForVar( + var_name, tensor, expected_kernel_type); + } +}; + +class DGCOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("U", "(Tensor) Middle tensor of DGC"); + AddInput("V", "(Tensor) Middle tensor of DGC"); + AddInput("Grad", "(Tensor) Input gradient"); + AddInput("current_step", "(Tensor) Current step."); + + AddOutput("U_out", + "(Tensor) " + "Output encoded gradient"); + AddOutput("V_out", + "(Tensor) " + "Output encoded gradient"); + AddOutput("EncodeGrad", + "(Tensor) " + "Output encoded gradient"); + AddOutput("Grad_out", + "(Tensor) " + "Output grad gradient"); + AddOutput("k", + "(Tensor) " + "Output top-k value"); + + AddAttr("m", + "(float, 0.9) " + "The momentum of learning rate.") + .SetDefault(0.9); + + AddAttr("use_nesterov", + "(bool, true)" + "The momentum of learning rate.") + .SetDefault(true); + + AddAttr>("sparsity", + "(vecotr, float)" + "The period sparsity of k_select."); + + AddAttr("rampup_begin_step", + "(float, 0.0)" + "The period when begin k_select.") + .SetDefault(0.0); + + AddAttr("rampup_step", + "(float, 0.0)" + "The period when begin k_select."); + + AddComment(R"DOC( + Original paper is https://arxiv.org/abs/1712.01887 + + DGC reduce the communication bandwidth by sending only the important gradients (sparse update):\ + only gradients larger than a threshold are transmitted. + + To avoid losing information, DGC accumulate the rest of the gradients locally. + + Eventually, these gradients become large enough to be transmitted. + + Thus, DGC send the large gradients immediately but eventually send all of the gradients over time. + + To ensure no loss of accuracy, DGC employs momentum correc-tionandlocal gradient clipping on top of the gradient sparsification to maintain model performance. + + DGC also uses momentum factor masking and warmup training to overcome the staleness problem caused by reduced communication. + + This optimizer will do two things: + + 1. 
Compress the gradient by get TopK import value from tensor \ + and use it for allreduce to reduce network bandwidth. + + 2. Call momentum to optimize on the cost. + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(dgc, ops::DGCOp, ops::DGCOpMaker); diff --git a/paddle/fluid/operators/dgc_op.cu b/paddle/fluid/operators/dgc_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..0f0bf441a70bef9cb69362a9cf333aeb51e835b6 --- /dev/null +++ b/paddle/fluid/operators/dgc_op.cu @@ -0,0 +1,20 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/dgc_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + dgc, ops::DGCOpKernel); diff --git a/paddle/fluid/operators/dgc_op.h b/paddle/fluid/operators/dgc_op.h new file mode 100644 index 0000000000000000000000000000000000000000..8d1683bdb2d521971ffbfa8d60b138a67d7eb52c --- /dev/null +++ b/paddle/fluid/operators/dgc_op.h @@ -0,0 +1,132 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include "dgc/dgc.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" + +namespace paddle { +namespace operators { + +inline float get_period_sparsity(const std::vector& sparsity, + float cur_step, float rampup_steps) { + PADDLE_ENFORCE(static_cast(cur_step) >= 0); + + size_t idx = static_cast(cur_step * sparsity.size() / rampup_steps); + if (idx >= sparsity.size()) { + return 0.999; + } + + PADDLE_ENFORCE(idx < sparsity.size()); + return sparsity[idx]; +} + +template +class DGCOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto u = ctx.Input("U"); + auto v = ctx.Input("V"); + auto g = ctx.Input("Grad"); + + // attrs + float m = ctx.Attr("m"); + bool use_nesterov = ctx.Attr("use_nesterov"); + auto sparsity = ctx.Attr>("sparsity"); + auto rampup_begin_step = ctx.Attr("rampup_begin_step"); + auto rampup_step = ctx.Attr("rampup_step"); + + // current step + auto current_step_tensor = ctx.Input("current_step"); + const float* current_step = current_step_tensor->data(); + + if (static_cast(*current_step) < static_cast(rampup_begin_step)) { + VLOG(10) << "current_step:" << *current_step + << " < rampup_begin_step:" << rampup_begin_step + << " so doesn't use dgc"; + return; + } + + float ratio = + 1 - get_period_sparsity(sparsity, static_cast(*current_step), + rampup_step); + PADDLE_ENFORCE(ratio > 0.0 && ratio < 1.0); + int k = static_cast(g->numel() * ratio); + + VLOG(10) << "m:" << m << ", use_nesterov:" << use_nesterov + << ", rampup_begin_step:" << rampup_begin_step + << ", rampup_step:" << rampup_step + << ", current_step:" << *current_step << ", ratio:" << ratio + << ", k:" << k; + + auto k_out = ctx.Output("k"); + T* k_out_data = k_out->data(); + *k_out_data = k; + + auto u_out = ctx.Output("U_out"); + auto v_out = ctx.Output("V_out"); + auto encode_grad_out = ctx.Output("EncodeGrad"); + + // FIXME(gongwb): use cublas.
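+    // The two branches below implement DGC's momentum correction on the + // local accumulators: with Nesterov, u = m * (u + g) and v = v + u + g; + // otherwise u = m * u + g and v = v + u. As a worked example of the + // schedule above (values assumed for illustration): with sparsity = + // {0.75, 0.9375, 0.984375, 0.996, 0.999}, rampup_step = 100 and + // current_step = 25, idx = 25 * 5 / 100 = 1, so ratio = 1 - 0.9375 = + // 0.0625 and k keeps only the largest 6.25% of gradient entries.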
+ auto u_out_e = framework::EigenVector::Flatten(*u_out); + auto u_e = framework::EigenVector::Flatten(*u); + auto g_e = framework::EigenVector::Flatten(*g); + auto& dev_ctx = ctx.template device_context(); + auto& eigen_ctx = *dev_ctx.eigen_device(); + if (use_nesterov) { + // u = m * (u + g) + u_out_e.device(eigen_ctx) = m * (u_e + g_e); + + // v = u + v + g + ElementwiseComputeEx, DeviceContext, T>( + ctx, u, v, 0, AddFunctor(), v_out); + + ElementwiseComputeEx, DeviceContext, T>( + ctx, g, v, 0, AddFunctor(), v_out); + } else { + // u = m * u + g + u_out_e.device(eigen_ctx) = m * u_e + g_e; + + // v = u + v + ElementwiseComputeEx, DeviceContext, T>( + ctx, u, v, 0, AddFunctor(), v_out); + } + + T* v_out_data = v_out->mutable_data(ctx.GetPlace()); + T* u_out_data = u_out->mutable_data(ctx.GetPlace()); + T* encode_grad_out_data = encode_grad_out->mutable_data( + framework::DDim{2 * k}, ctx.GetPlace()); + + int buf_size = paddle::communication::dgc::get_buffer_size(k); + auto& allocator = platform::DeviceTemporaryAllocator::Instance().Get( + ctx.GetPlace(), dev_ctx.stream()); + auto tmp_ious_data = allocator.Allocate(buf_size); + void* buf = reinterpret_cast(tmp_ious_data->ptr()); + + if (!paddle::communication::dgc::k_select( + static_cast(encode_grad_out_data), k, v_out_data, + static_cast(v_out->numel()), buf, dev_ctx.stream(), + u_out_data)) { + LOG(FATAL) << "v_out numel:" << v_out->numel(); + } + + auto grad_out = ctx.Output("Grad_out"); + math::SetConstant tset; + tset(dev_ctx, grad_out, static_cast(0)); + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc index c63d65348880ebb4085d83059d9fead6456216d7..65295c2c103ceca50d9de3ae314246256497d084 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc @@ -14,6 +14,7 @@ #include #include +#include #include #include "paddle/fluid/operators/distributed/parameter_prefetch.h" @@ -218,7 +219,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, boost::get(id_tensor.place()), id_tensor.data(), sizeof(int64_t) * id_tensor.numel(), stream); - for (size_t i = 0; i < cpu_tensor.numel(); ++i) { + for (int64_t i = 0; i < cpu_tensor.numel(); ++i) { ids_vector.push_back(cpu_tensor_data[i]); } #endif diff --git a/paddle/fluid/operators/distributed_ops/allreduce_op.cc b/paddle/fluid/operators/distributed_ops/allreduce_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0fbc27515cec9f7982852954055aa929f678a096 --- /dev/null +++ b/paddle/fluid/operators/distributed_ops/allreduce_op.cc @@ -0,0 +1,143 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include // NOLINT +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +struct MutableDataFunctor { + MutableDataFunctor(void** data, framework::LoDTensor* tensor, + const platform::Place& place) + : data_(data), tensor_(tensor), place_(place) {} + + template + void apply() { + *data_ = tensor_->mutable_data(place_); + } + + void** data_; + framework::LoDTensor* tensor_; + platform::Place place_; +}; + +class AllReduceOp : public framework::OperatorBase { + using OperatorBase::OperatorBase; + + void RunImpl(const framework::Scope& scope, + const platform::Place& place) const override { + PADDLE_ENFORCE(is_gpu_place(place), + "AllReduce op can run on gpu place only for now."); +#ifdef PADDLE_WITH_CUDA + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* ctx = pool.Get(place); + auto in_names = Inputs("X"); + auto out_names = Outputs("Out"); + PADDLE_ENFORCE_EQ(in_names.size(), 1, "Only support one input"); + PADDLE_ENFORCE_EQ(out_names.size(), 1, "Only support one output"); + + auto* in = scope.FindVar(in_names[0]); + auto* out = scope.FindVar(out_names[0]); + + PADDLE_ENFORCE(in->IsType() || + out->IsType(), + "Only support allreduce LoDTensors"); + + int dtype = -1; + auto in_tensor = in->Get(); + dtype = platform::ToNCCLDataType(in_tensor.type()); + + int64_t numel = in_tensor.numel(); + auto* sendbuff = in_tensor.data(); + auto* out_tensor = out->GetMutable(); + out_tensor->Resize(in_tensor.dims()); + void* recvbuff = nullptr; + framework::VisitDataType(in_tensor.type(), + MutableDataFunctor(&recvbuff, out_tensor, place)); + + auto cuda_ctx = static_cast(ctx); + auto* comm = cuda_ctx->nccl_comm(); + // FIXME(typhoonzero): should use nccl stream here. + auto stream = cuda_ctx->stream(); + + int reduce_type = Attr("reduce_type"); + ncclRedOp_t red_type = ncclSum; + switch (reduce_type) { + case 0: + red_type = ncclSum; + break; + case 1: + red_type = ncclProd; + break; + case 2: + red_type = ncclMax; + break; + case 3: + red_type = ncclMin; + break; + } + + PADDLE_ENFORCE(platform::dynload::ncclAllReduce( + sendbuff, recvbuff, numel, static_cast(dtype), red_type, + comm, stream)); +#endif + } +}; + +class AllReduceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "(Tensor), tensor to be allreduced."); + AddOutput("Out", "(Tensor) the result of allreduce."); + AddAttr("reduce_type", "(int) determine the reduce type.") + .SetDefault(0); + AddComment(R"DOC( +***AllReduce Operator*** + +Call NCCL AllReduce internally. Note that this op must be used when one +thread is managing one GPU device. + +For speed reasons, reduce_type should be an integer: + +0: sum +1: prod +2: max +3: min + +If input and output are the same variable, in-place allreduce will be used.
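+ +For example, with reduce_type = 0 (sum) across two devices holding x0 and x1, every device receives Out = x0 + x1 after the call.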
+)DOC"); + } +}; + +class AllReduceOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override {} +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(allreduce, ops::AllReduceOp, + paddle::framework::EmptyGradOpMaker, ops::AllReduceOpMaker, + ops::AllReduceOpShapeInference); diff --git a/paddle/fluid/operators/distributed_ops/fake_init_op.cc b/paddle/fluid/operators/distributed_ops/fake_init_op.cc index 28ebdcb03ea83f3ec701106111a7cc5f0f7ed7dc..5ee35e0458a64dacc1c469a435edd28de1b78e6b 100644 --- a/paddle/fluid/operators/distributed_ops/fake_init_op.cc +++ b/paddle/fluid/operators/distributed_ops/fake_init_op.cc @@ -56,8 +56,7 @@ class FakeInitOp : public framework::OperatorBase { class FakeInitOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override {} + void operator()(framework::InferVarTypeContext *ctx) const override {} }; class FakeInitOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/distributed_ops/merge_ids_op.cc b/paddle/fluid/operators/distributed_ops/merge_ids_op.cc index da0185b8c492eeb694902b46c871c44cd060d438..1b0b4dd31693340bc39c0da8995a2a2d40b13e00 100644 --- a/paddle/fluid/operators/distributed_ops/merge_ids_op.cc +++ b/paddle/fluid/operators/distributed_ops/merge_ids_op.cc @@ -114,11 +114,10 @@ class MergeIdsOp : public framework::OperatorWithKernel { class MergeIdsOpInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { - auto *input_var = block->Var(op_desc.Input("Ids")[0]); - for (auto &out_var : op_desc.Output("Out")) { - block->Var(out_var)->SetType(input_var->GetType()); + void operator()(framework::InferVarTypeContext *ctx) const override { + auto input_type = ctx->GetType(ctx->Input("Ids")[0]); + for (auto &out_var : ctx->Output("Out")) { + ctx->SetType(out_var, input_type); } } }; diff --git a/paddle/fluid/operators/distributed_ops/split_ids_op.cc b/paddle/fluid/operators/distributed_ops/split_ids_op.cc index f61d387fbef636298c412c227bf7a56a04f69c63..191ca1efe8ca5798ddbd38968eafde349af8a7d1 100644 --- a/paddle/fluid/operators/distributed_ops/split_ids_op.cc +++ b/paddle/fluid/operators/distributed_ops/split_ids_op.cc @@ -14,6 +14,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/distributed_ops/split_ids_op.h" +#include + namespace paddle { namespace operators { @@ -71,11 +73,10 @@ class SplitIdsOp : public framework::OperatorWithKernel { class SplitIdsOpInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { - auto *input_var = block->Var(op_desc.Input("Ids")[0]); - for (auto &out_var : op_desc.Output("Out")) { - block->Var(out_var)->SetType(input_var->GetType()); + void operator()(framework::InferVarTypeContext *ctx) const override { + auto input_type = ctx->GetType(ctx->Input("Ids")[0]); + for (auto &out_var : ctx->Output("Out")) { + ctx->SetType(out_var, input_type); } } }; diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc index 2ccc86c1dc04a3afeb02b24677e6ebce40cca4fa..65c2ff6415c1d51fdc05d6014da589678761b676 100644 --- a/paddle/fluid/operators/dropout_op.cc +++ b/paddle/fluid/operators/dropout_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/dropout_op.h" +#include #include namespace paddle { @@ -70,7 +71,7 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker { "1. downgrade_in_infer(default), downgrade the outcome at inference " "time" " train: out = input * mask" - " inference: out = input * dropout_prob" + " inference: out = input * (1.0 - dropout_prob)" "2. upscale_in_train, upscale the outcome at training time, do nothing " "in inference" " train: out = input * mask / ( 1.0 - dropout_prob )" @@ -106,21 +107,31 @@ class DropoutOpGrad : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(ctx->Attrs().Get("is_test"), false, "GradOp is only callable when is_test is false"); - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); PADDLE_ENFORCE(ctx->HasInput("Mask"), "Mask must not be null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Input(Out@GRAD) must not be null."); - auto x_dims = ctx->GetInputDim("X"); auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); - PADDLE_ENFORCE_EQ(x_dims, out_dims, - "Dimensions of Input(X) and Out@Grad must be the same."); - auto mask_dims = ctx->GetInputDim("Mask"); - PADDLE_ENFORCE_EQ(x_dims, mask_dims, - "Dimensions of Input(X) and Mask must be the same."); - - ctx->SetOutputDim(framework::GradVarName("X"), x_dims); - ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); + + ctx->SetOutputDim(framework::GradVarName("X"), out_dims); + ctx->ShareLoD(framework::GradVarName("Out"), + /*->*/ framework::GradVarName("X")); + } +}; + +class DropoutGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("dropout_grad"); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetInput("Mask", Output("Mask")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; } }; @@ -129,7 +140,7 @@ class DropoutOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OPERATOR(dropout, ops::DropoutOp, ops::DropoutOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::DropoutGradOpDescMaker); REGISTER_OPERATOR(dropout_grad, ops::DropoutOpGrad); REGISTER_OP_CPU_KERNEL( dropout, ops::CPUDropoutKernel, 
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc index c6c658236c235f0a6767924026b0a7610071e918..2b3fc06dcb79b8c6b46de7abf51bdb2c47acca1c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc @@ -16,8 +16,7 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op.h" namespace ops = paddle::operators; REGISTER_ELEMWISE_GRAD_MAKER(elementwise_add, Add); -REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_add, "Add", "Out = X + Y", "Out", - "X"); +REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_add, "Add", "Out = X + Y"); REGISTER_OP_CPU_KERNEL( elementwise_add, diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..66c56da417487e3b2ee94ad572d83a971958ab62 --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc @@ -0,0 +1,38 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/elementwise/elementwise_floordiv_op.h" +#include +#include "paddle/fluid/operators/elementwise/elementwise_op.h" + +namespace paddle { +namespace operators { +class ElementwiseFloorDivOpMaker : public ElementwiseOpMaker { + protected: + std::string GetName() const override { return "FloorDiv"; } + std::string GetEquation() const override { return "Out = X // Y"; } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_WITHOUT_GRADIENT(elementwise_floordiv, ops::ElementwiseOp, + ops::ElementwiseFloorDivOpMaker); + +REGISTER_OP_CPU_KERNEL( + elementwise_floordiv, + ops::ElementwiseFloorDivKernel, + ops::ElementwiseFloorDivKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..60846d1e8fee1c7f68ac101f18355750c2c15a4d --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu @@ -0,0 +1,23 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "paddle/fluid/operators/elementwise/elementwise_floordiv_op.h" +#include "paddle/fluid/platform/float16.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL( + elementwise_floordiv, + ops::ElementwiseFloorDivKernel, + ops::ElementwiseFloorDivKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h new file mode 100644 index 0000000000000000000000000000000000000000..2d24e394d5c823dbd22c837210e46cefeceba1be --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" +#include "paddle/fluid/operators/math/blas.h" + +namespace paddle { +namespace operators { + +template +struct FloorDivFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a / b; } +}; + +template +void elementwise_floor_div(const framework::ExecutionContext &ctx, + const framework::Tensor *x, + const framework::Tensor *y, framework::Tensor *z) { + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>( + ctx, x, y, axis, FloorDivFunctor(), z); +} + +template +class ElementwiseFloorDivKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *x = ctx.Input("X"); + auto *y = ctx.Input("Y"); + auto *z = ctx.Output("Out"); + + z->mutable_data(ctx.GetPlace()); + + // dtype of x and y is int64 or int32 + elementwise_floor_div(ctx, x, y, z); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.cc b/paddle/fluid/operators/elementwise/elementwise_mod_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d63a7df03d0de7489a507825b066ab365e1ef8b9 --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.cc @@ -0,0 +1,36 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/elementwise/elementwise_mod_op.h" +#include +#include "paddle/fluid/operators/elementwise/elementwise_op.h" + +namespace paddle { +namespace operators { +class ElementwiseModOpMaker : public ElementwiseOpMaker { + protected: + std::string GetName() const override { return "Mod"; } + std::string GetEquation() const override { return "Out = X % Y"; } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(elementwise_mod, ops::ElementwiseOp, + ops::ElementwiseModOpMaker); + +REGISTER_OP_CPU_KERNEL( + elementwise_mod, + ops::ElementwiseModKernel, + ops::ElementwiseModKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.cu b/paddle/fluid/operators/elementwise/elementwise_mod_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..da3304a83952d448ffcad61f1878b06d354168b9 --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.cu @@ -0,0 +1,22 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/operators/elementwise/elementwise_mod_op.h" +#include "paddle/fluid/platform/float16.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL( + elementwise_mod, ops::ElementwiseModKernel, + ops::ElementwiseModKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.h b/paddle/fluid/operators/elementwise/elementwise_mod_op.h new file mode 100644 index 0000000000000000000000000000000000000000..5b139fd4b33152b4a340c6c5a0f094338bbdffc8 --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" +#include "paddle/fluid/operators/math/blas.h" + +namespace paddle { +namespace operators { + +template +struct ModFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a % b; } +}; + +template +void elementwise_mod(const framework::ExecutionContext &ctx, + const framework::Tensor *x, const framework::Tensor *y, + framework::Tensor *z) { + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + ModFunctor(), z); +} + +template +class ElementwiseModKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *x = ctx.Input("X"); + auto *y = ctx.Input("Y"); + auto *z = ctx.Output("Out"); + + z->mutable_data(ctx.GetPlace()); + + // dtype of x and y is int64 or int32 + elementwise_mod(ctx, x, y, z); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index 91e44152658d87750f0b6d5826c481904085e086..6dbb9072495f743a4df1ff05e029a227c2cf618b 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -14,7 +14,9 @@ limitations under the License. */ #pragma once +#include #include +#include #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -250,43 +252,31 @@ class ElemwiseGradKernel : public framework::OpKernel { } }; -class ElementwiseOpInplace : public framework::InplaceInToOut { +class ElementwiseOpInplace : public framework::InplaceOpInference { public: - using framework::InplaceInToOut::InplaceInToOut; - - protected: - std::unordered_map Apply( - const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { + std::unordered_map operator()( + const framework::OpDesc &op_desc) const override { return std::unordered_map{ {"X", "Out"}, }; } }; -class ElementwiseGradOpInplace : public framework::InplaceInToOut { +class ElementwiseGradOpInplace : public framework::InplaceOpInference { public: - using framework::InplaceInToOut::InplaceInToOut; - - protected: - std::unordered_map Apply( - const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { - std::unordered_map ret; - if (block->HasVar(framework::GradVarName("X")) && - block->HasVar(framework::GradVarName("Out"))) { - ret[framework::GradVarName("Out")] = framework::GradVarName("X"); - } - return ret; + std::unordered_map operator()( + const framework::OpDesc &op_desc) const override { + return std::unordered_map{ + {framework::GradVarName("Out"), framework::GradVarName("X")}, + }; } }; +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ElementwiseGradNoBufVarsInference, "Y"); + } // namespace operators } // namespace paddle -/* -*/ - #define REGISTER_ELEMWISE_GRAD_MAKER(kernel_type, op_name) \ class kernel_type##GradMaker \ : public paddle::framework::SingleGradOpDescMaker { \ @@ -320,18 +310,19 @@ class ElementwiseGradOpInplace : public framework::InplaceInToOut { ::paddle::framework::DefaultGradOpDescMaker); \ REGISTER_OPERATOR(op_type##_grad, ::paddle::operators::ElementwiseOpGrad) -#define REGISTER_ELEMWISE_EXPLICIT_OP(op_type, op_name, equation, ...) 
\ - class __ElemwiseOp##op_type##Maker__ \ - : public ::paddle::operators::ElementwiseOpMaker { \ - protected: \ - virtual std::string GetName() const { return op_name; } \ - virtual std::string GetEquation() const { return equation; } \ - }; \ - REGISTER_OPERATOR(op_type, ::paddle::operators::ElementwiseOp, \ - __ElemwiseOp##op_type##Maker__, \ - ::paddle::operators::ElementwiseOpInferVarType, \ - op_type##GradMaker, \ - ::paddle::operators::ElementwiseOpInplace); \ - REGISTER_OPERATOR(op_type##_grad, \ - ::paddle::operators::ElementwiseOpExplicitGrad, \ - ::paddle::operators::ElementwiseGradOpInplace) +#define REGISTER_ELEMWISE_EXPLICIT_OP(op_type, op_name, equation) \ + class __ElemwiseOp##op_type##Maker__ \ + : public ::paddle::operators::ElementwiseOpMaker { \ + protected: \ + virtual std::string GetName() const { return op_name; } \ + virtual std::string GetEquation() const { return equation; } \ + }; \ + REGISTER_OPERATOR(op_type, ::paddle::operators::ElementwiseOp, \ + __ElemwiseOp##op_type##Maker__, \ + ::paddle::operators::ElementwiseOpInferVarType, \ + op_type##GradMaker, \ + ::paddle::operators::ElementwiseOpInplace); \ + REGISTER_OPERATOR(op_type##_grad, \ + ::paddle::operators::ElementwiseOpExplicitGrad, \ + ::paddle::operators::ElementwiseGradOpInplace, \ + ::paddle::operators::ElementwiseGradNoBufVarsInference) diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc index efc66374c812cbd07adef6ac25c9616b880ec383..04c87c1b2ac398f8f75265c80bef5326aea15dce 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc @@ -16,8 +16,7 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op.h" namespace ops = paddle::operators; REGISTER_ELEMWISE_GRAD_MAKER(elementwise_sub, Sub); -REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_sub, "Sub", "Out = X - Y", "Out", - "X"); +REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_sub, "Sub", "Out = X - Y"); REGISTER_OP_CPU_KERNEL( elementwise_sub, diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc index 04e8800bbc888540c4df21360c767688eb19c423..f2f4d3fee053a1e5bacd3c2165dba960f3befea4 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc @@ -110,8 +110,9 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel { constexpr int simd_width = 16; int C = c / simd_width; - auto multiply = jit::Get, - platform::CPUPlace>(0); + auto multiply = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(0); #pragma omp parallel for collapse(2) for (int ni = 0; ni < n; ni++) { for (int ci = 0; ci < C; ci++) { diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc index 44a2f37b66772425a835c26e94c37b500e8a5d19..fcb2be93635eeaeaae25c3a845fd06aa1a73e2e7 100644 --- a/paddle/fluid/operators/expand_op.cc +++ b/paddle/fluid/operators/expand_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/expand_op.h" +#include #include namespace paddle { @@ -138,12 +139,28 @@ class ExpandGradOp : public framework::OperatorWithKernel { } }; +class ExpandGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("expand_grad"); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(expand, ops::ExpandOp, ops::ExpandOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::ExpandGradOpDescMaker); REGISTER_OPERATOR(expand_grad, ops::ExpandGradOp); REGISTER_OP_CPU_KERNEL( expand, ops::ExpandKernel, diff --git a/paddle/fluid/operators/fake_dequantize_op.cc b/paddle/fluid/operators/fake_dequantize_op.cc index 5d6488c67e0db440c8d4609736523643dd666dcc..4a8937ba1c7ef9827ecc9bf575d9893c95a3b22b 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cc +++ b/paddle/fluid/operators/fake_dequantize_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/fake_dequantize_op.h" #include +#include namespace paddle { namespace operators { @@ -32,8 +33,51 @@ struct DequantizeFunctor { } }; +template +struct ChannelDequantizeFunctor { + void operator()(const platform::CPUDeviceContext& dev_ctx, + const framework::Tensor* in, const framework::Tensor** scales, + const int scale_num, T max_range, framework::Tensor* out) { + if (scale_num == 1) { + const int channel = in->dims()[0]; + const T* scale_factor = scales[0]->data(); + for (int i = 0; i < channel; i++) { + T s = scale_factor[i]; + framework::Tensor one_channel_in = in->Slice(i, i + 1); + framework::Tensor one_channel_out = out->Slice(i, i + 1); + auto in_e = framework::EigenVector::Flatten(one_channel_in); + auto out_e = framework::EigenVector::Flatten(one_channel_out); + auto& dev = *dev_ctx.eigen_device(); + out_e.device(dev) = (s / max_range) * in_e; + } + } else if (scale_num == 2) { + int batch_size = in->dims()[0]; + int channel = in->dims()[1]; + const T* scale_one = scales[0]->data(); + const T* scale_two = scales[1]->data(); + for (int i = 0; i < batch_size; i++) { + framework::Tensor one_batch_in = in->Slice(i, i + 1).Resize( + framework::slice_ddim(in->dims(), 1, in->dims().size())); + framework::Tensor one_batch_out = out->Slice(i, i + 1).Resize( + framework::slice_ddim(out->dims(), 1, out->dims().size())); + for (int j = 0; j < channel; j++) { + T s = scale_one[j]; + framework::Tensor one_channel_in = one_batch_in.Slice(j, j + 1); + framework::Tensor one_channel_out = one_batch_out.Slice(j, j + 1); + auto in_e = framework::EigenVector::Flatten(one_channel_in); + auto out_e = framework::EigenVector::Flatten(one_channel_out); + auto& dev = *dev_ctx.eigen_device(); + out_e.device(dev) = (s * scale_two[0] / max_range) * in_e; + } + } + } + } +}; + template struct DequantizeFunctor; template struct DequantizeFunctor; +template struct ChannelDequantizeFunctor; +template struct ChannelDequantizeFunctor; class FakeDequantizeMaxAbsOp : public framework::OperatorWithKernel { public: @@ -76,6 +120,63 @@ $$Out = \frac{scale*X}{ max_range }$$ } }; +class FakeChannelWiseDequantizeMaxAbsOp : public framework::OperatorWithKernel { + public: 
+ using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE( + ctx->HasInput("X"), + "Input(X) of FakeChannelWiseDequantizeMaxAbsOp should not be null."); + PADDLE_ENFORCE(ctx->HasInputs("Scales"), + "Input(Scales) of FakeChannelWiseDequantizeMaxAbsOp " + "should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("Out"), + "Output(Out) of FakeChannelWiseDequantizeMaxAbsOp should not be null."); + + ctx->ShareDim("X", /*->*/ "Out"); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class FakeChannelWiseDequantizeMaxAbsOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor) The input with float32/64 type is the " + "low precision tensor."); + AddInput("Scales", + "(Tensors) The scales in quantization stage. " + "Now, `Scales` is a vector with at most two tensors. " + "If Scales has two elements, the second tensor should only have " + "one value.") + .AsDuplicable(); + AddOutput("Out", + "(Tensor) The output is the dequantized high " + "precision tensor."); + AddAttr>( + "quant_bits", + "Quantization bit numbers in quantization stage. " + "The size of `quant_bits` should be equal to the size of `Scales`.") + .SetDefault({8}); + + AddComment(R"DOC( +FakeChannelWiseDequantizeMaxAbsOp operator. + +This calculation is the inverse operation of FakeChannelWiseQuantizeMaxAbsOp: + +$$Out_c = \frac{X_c\prod_{i=1}^{n}Scales_{ic}}{\prod_{i=1}^{n}(2^{quant\_bits_i-1}-1)}$$ + +In the above formula, the range value of $c$ can be represented as $0 \leq c \lt \ the\ channel\ number\ of\ X$. +Besides, the size of $quant\_bits$ should be equal to the size of $Scales$, and it is called $n$ in the formula. + +Note: In general, the per-channel quantization is only applied to weights and the activations use per-layer quantization.
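+ +For example, with two scale tensors and quant_bits = {8, 8}, the denominator is (2^7 - 1) * (2^7 - 1) = 16129, so each output channel is X_c * Scales_{1c} * Scales_2 / 16129.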
+)DOC"); + } +}; + } // namespace operators } // namespace paddle @@ -88,3 +189,11 @@ REGISTER_OPERATOR(fake_dequantize_max_abs, ops::FakeDequantizeMaxAbsOp, REGISTER_OP_CPU_KERNEL(fake_dequantize_max_abs, ops::FakeDequantizeMaxAbsKernel, ops::FakeDequantizeMaxAbsKernel); + +REGISTER_OPERATOR(fake_channel_wise_dequantize_max_abs, + ops::FakeChannelWiseDequantizeMaxAbsOp, + ops::FakeChannelWiseDequantizeMaxAbsOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(fake_channel_wise_dequantize_max_abs, + ops::FakeChannelWiseDequantizeMaxAbsKernel, + ops::FakeChannelWiseDequantizeMaxAbsKernel); diff --git a/paddle/fluid/operators/fake_dequantize_op.cu b/paddle/fluid/operators/fake_dequantize_op.cu index 225bcc45bc65bc9268d1e866a4358731eaf0c3ef..02f9dc827d68cbb58447ed1557ff4bf310b2c017 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cu +++ b/paddle/fluid/operators/fake_dequantize_op.cu @@ -44,8 +44,66 @@ struct DequantizeFunctor { } }; +template +__global__ void DequantizeOneScale(const T* in, const T* scale, T max_range, + int num, int channel, T* out) { + int tid = threadIdx.x; + int channel_size = num / channel; + const T* in_c = in + blockIdx.x * channel_size; + T* out_c = out + blockIdx.x * channel_size; + for (int i = tid; i < channel_size; i += blockDim.x) { + out_c[i] = in_c[i] * scale[blockIdx.x] / max_range; + } +} + +template +__global__ void DequantizeTwoScale(const T* in, const T* scale_one, + const T* scale_two, T max_range, int num, + int batch_size, int channel, T* out) { + int tid = threadIdx.x; + int channel_size = num / (batch_size * channel); + int scale_index = blockIdx.x % channel; + const T* in_c = in + blockIdx.x * channel_size; + T* out_c = out + blockIdx.x * channel_size; + for (int i = tid; i < channel_size; i += blockDim.x) { + out_c[i] = in_c[i] * scale_one[scale_index] * scale_two[0] / max_range; + } +} + +template +struct ChannelDequantizeFunctor { + void operator()(const platform::CUDADeviceContext& dev_ctx, + const framework::Tensor* in, const framework::Tensor** scales, + const int scale_num, T max_range, framework::Tensor* out) { + const T* in_data = in->data(); + T* out_data = out->mutable_data(dev_ctx.GetPlace()); + if (scale_num == 1) { + int num = in->numel(); + int channel = in->dims()[0]; + const T* scale_factor = scales[0]->data(); + int block = 1024; + int grid = channel; + DequantizeOneScale<<>>( + in_data, scale_factor, max_range, num, channel, out_data); + } else if (scale_num == 2) { + int num = in->numel(); + int batch_size = in->dims()[0]; + int channel = in->dims()[1]; + const T* scale_one = scales[0]->data(); + const T* scale_two = scales[1]->data(); + int block = 1024; + int grid = batch_size * channel; + DequantizeTwoScale<<>>( + in_data, scale_one, scale_two, max_range, num, batch_size, channel, + out_data); + } + } +}; + template struct DequantizeFunctor; template struct DequantizeFunctor; +template struct ChannelDequantizeFunctor; +template struct ChannelDequantizeFunctor; } // namespace operators } // namespace paddle @@ -55,3 +113,7 @@ using CUDA = paddle::platform::CUDADeviceContext; REGISTER_OP_CUDA_KERNEL(fake_dequantize_max_abs, ops::FakeDequantizeMaxAbsKernel, ops::FakeDequantizeMaxAbsKernel); +REGISTER_OP_CUDA_KERNEL( + fake_channel_wise_dequantize_max_abs, + ops::FakeChannelWiseDequantizeMaxAbsKernel, + ops::FakeChannelWiseDequantizeMaxAbsKernel); diff --git a/paddle/fluid/operators/fake_dequantize_op.h b/paddle/fluid/operators/fake_dequantize_op.h index 
d9923a10daa01ca06ebabb27cf9285b0628634bc..ed9a0a4d65fab5ce1ef48835c332fade978d2bae 100644 --- a/paddle/fluid/operators/fake_dequantize_op.h +++ b/paddle/fluid/operators/fake_dequantize_op.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once +#include +#include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" @@ -27,6 +29,13 @@ struct DequantizeFunctor { framework::Tensor* out); }; +template +struct ChannelDequantizeFunctor { + void operator()(const DeviceContext& dev_ctx, const framework::Tensor* in, + const framework::Tensor** scales, const int scale_num, + T max_range, framework::Tensor* out); +}; + template class FakeDequantizeMaxAbsKernel : public framework::OpKernel { public: @@ -45,5 +54,43 @@ class FakeDequantizeMaxAbsKernel : public framework::OpKernel { } }; +template +class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel { + public: + virtual void Compute(const framework::ExecutionContext& ctx) const { + auto* in = ctx.Input("X"); + auto scales = ctx.MultiInput("Scales"); + auto* out = ctx.Output("Out"); + + auto quant_bits = ctx.Attr>("quant_bits"); + int max_range = 1; + + auto& dev_ctx = ctx.template device_context(); + out->mutable_data(dev_ctx.GetPlace()); + int scale_num = scales.size(); + if (scale_num == 1) { + PADDLE_ENFORCE_EQ( + scales[0]->numel(), in->dims()[0], + "The number of first scale values must be the same as the " + "first dimension value of Input(X) when the `Scales` has only one " + "element."); + max_range *= (std::pow(2, quant_bits[0] - 1) - 1); + } else if (scale_num == 2) { + PADDLE_ENFORCE_EQ( + scales[0]->numel(), in->dims()[1], + "The number of first scale values must be the same as the " + "second dimension value of Input(X) when the `Scales` has two " + "elements."); + PADDLE_ENFORCE_EQ( + scales[1]->numel(), 1, + "The second scale tensor should only have one value for now."); + max_range *= (std::pow(2, quant_bits[0] - 1) - 1) * + (std::pow(2, quant_bits[1] - 1) - 1); + } + ChannelDequantizeFunctor()( + dev_ctx, in, scales.data(), scale_num, static_cast(max_range), out); + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc index 3bb07d383548e6f4be810c96d2a916c0fe5e45f5..054ef4658cc0c4448d49870849017d3191d57db9 100644 --- a/paddle/fluid/operators/fake_quantize_op.cc +++ b/paddle/fluid/operators/fake_quantize_op.cc @@ -37,6 +37,21 @@ struct FindAbsMaxFunctor { template struct FindAbsMaxFunctor; +template +struct FindChannelAbsMaxFunctor { + void operator()(const platform::CPUDeviceContext& ctx, const T* in, + const int num, const int channel, T* out) { + const int channel_size = num / channel; + for (int i = 0; i < channel; i++) { + auto* start = in + i * channel_size; + auto* end = in + (i + 1) * channel_size; + out[i] = std::abs(*(std::max_element(start, end, Compare()))); + } + } +}; + +template struct FindChannelAbsMaxFunctor; + template struct ClipAndFakeQuantFunctor { void operator()(const platform::CPUDeviceContext& ctx, @@ -53,6 +68,36 @@ struct ClipAndFakeQuantFunctor { template struct ClipAndFakeQuantFunctor; +template +struct ChannelClipAndFakeQuantFunctor { + void operator()(const platform::CPUDeviceContext& ctx, + const framework::Tensor& in, const framework::Tensor& scale, + const int bin_cnt, const int channel, + framework::Tensor* out) { + auto* scale_data = scale.data(); + auto* in_data = in.data(); + auto* out_data =
out->mutable_data(ctx.GetPlace()); + const int channel_size = in.numel() / channel; + platform::Transform trans; + for (int i = 0; i < channel; i++) { + T s = scale_data[i]; + auto* start = in_data + i * channel_size; + auto* end = in_data + (i + 1) * channel_size; + trans(ctx, start, end, out_data + i * channel_size, + ClipFunctor(-s, s)); + } + for (int i = 0; i < channel; i++) { + T s = scale_data[i]; + framework::Tensor one_channel_out = out->Slice(i, i + 1); + auto out_e = framework::EigenVector::Flatten(one_channel_out); + out_e.device(*ctx.eigen_device()) = (bin_cnt / s * out_e).round(); + } + } +}; + +template struct ChannelClipAndFakeQuantFunctor; + template struct FindRangeAbsMaxFunctor { void operator()(const platform::CPUDeviceContext& ctx, @@ -81,6 +126,30 @@ struct FindRangeAbsMaxFunctor { template struct FindRangeAbsMaxFunctor; +template +struct FindMovingAverageAbsMaxFunctor { + void operator()(const platform::CPUDeviceContext& ctx, + const framework::Tensor& in_accum, + const framework::Tensor& in_state, const T* cur_scale, + const float rate, framework::Tensor* out_state, + framework::Tensor* out_accum, framework::Tensor* out_scale) { + T accum = in_accum.data()[0]; + T state = in_state.data()[0]; + T scale = cur_scale[0]; + + state = rate * state + 1; + accum = rate * accum + scale; + scale = accum / state; + + out_state->mutable_data(ctx.GetPlace())[0] = state; + out_accum->mutable_data(ctx.GetPlace())[0] = accum; + out_scale->mutable_data(ctx.GetPlace())[0] = scale; + } +}; + +template struct FindMovingAverageAbsMaxFunctor; + class FakeQuantizeAbsMaxOp : public framework::OperatorWithKernel { public: FakeQuantizeAbsMaxOp(const std::string& type, @@ -134,6 +203,60 @@ $$Out = round(X/scale * range)$$ } }; +class FakeChannelWiseQuantizeAbsMaxOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of FakeChannelWiseQuantizeOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("Out"), + "Output(Out) of FakeChannelWiseQuantizeOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("OutScale"), + "Output(Scale) of FakeChannelWiseQuantizeOp should not be null."); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->SetOutputDim("OutScale", {ctx->GetInputDim("X")[0]}); + ctx->ShareLoD("X", /*->*/ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); + } +}; + +class FakeChannelWiseQuantizeAbsMaxOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor) Input is float data type."); + AddOutput("Out", + "(Tensor) Output of quantized low level tensor, " + "but also saved as float data type."); + AddOutput("OutScale", "(Tensor) Current channel wise scale"); + AddAttr("bit_length", "(int, default 8)") + .SetDefault(8) + .AddCustomChecker([](const int& bit_length) { + PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16, + "'bit_length' should be between 1 and 16."); + }); + AddComment(R"DOC( +The scale of FakeChannelWiseQuantize operator is a vector. +In detail, each channel of the input X has a scale value. 
+ +$$scale_c = max(abs(X_c))$$ +$$range = 2^{bit\_length - 1} - 1$$ +$$Out_c = round(\frac{X_c * range} {scale_c})$$ +In the above three formulas, the range of $c$ is as follows: +$$0 \leq c \lt \ the\ channel\ number\ of\ X$$ +)DOC"); + } +}; + class FakeQuantizeRangeAbsMaxOp : public framework::OperatorWithKernel { public: FakeQuantizeRangeAbsMaxOp(const std::string& type, @@ -201,6 +324,78 @@ $$Out = round(X/scale * range)$$ } }; +class FakeQuantizeMovingAverageAbsMaxOp : public framework::OperatorWithKernel { + public: + FakeQuantizeMovingAverageAbsMaxOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE( + ctx->HasInput("X"), + "Input(X) of FakeQuantizeMovingAverageAbsMaxOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("Out"), + "Output(Out) of FakeQuantizeMovingAverageAbsMaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("OutScale"), + "Output(OutScale) of FakeQuantizeMovingAverageAbsMaxOp " + "should not be null."); + if (ctx->HasOutput("OutState")) { + ctx->SetOutputDim("OutState", {1}); + } + if (ctx->HasOutput("OutAccum")) { + ctx->SetOutputDim("OutAccum", {1}); + } + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->SetOutputDim("OutScale", {1}); + ctx->ShareLoD("X", /*->*/ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); + } +}; + +class FakeQuantizeMovingAverageAbsMaxOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor) Input is float data type."); + AddInput("InScale", "Last scale."); + AddInput("InAccum", "Last accum.").AsDispensable(); + AddInput("InState", "Last state.").AsDispensable(); + AddOutput("Out", "(Tensor) Output of quantized low level tensor."); + AddOutput("OutScale", "Current scale"); + AddOutput("OutState", "(Tensor) state buffer.").AsDispensable(); + AddOutput("OutAccum", "(Tensor) accum buffer.").AsDispensable(); + AddAttr("moving_rate", "(float, default 0.9) moving rate.") + .SetDefault(0.9); + AddAttr("bit_length", "(int, default 8), quantization bit number.") + .SetDefault(8) + .AddCustomChecker([](const int& bit_length) { + PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16, + "'bit_length' should be between 1 and 16."); + }); + AddAttr("is_test", + "(bool, default false) Set to true for inference only, false " + "for training. Some layers may run faster when this is true.") + .SetDefault(false); + AddComment(R"DOC( +FakeQuantize operator is used in static quantization.
+ +$$scale = (0.9*accum + max(abs(x)))/(0.9*state + 1)$$ +$$range = 2^{bit\_length - 1} - 1$$ +$$Out = round(X/scale * range)$$ + +)DOC"); + } +}; + } // namespace operators } // namespace paddle @@ -218,3 +413,16 @@ REGISTER_OPERATOR(fake_quantize_range_abs_max, ops::FakeQuantizeRangeAbsMaxOp, paddle::framework::EmptyGradOpMaker); REGISTER_OP_CPU_KERNEL(fake_quantize_range_abs_max, ops::FakeQuantizeRangeAbsMaxKernel); + +REGISTER_OPERATOR(fake_quantize_moving_average_abs_max, + ops::FakeQuantizeMovingAverageAbsMaxOp, + ops::FakeQuantizeMovingAverageAbsMaxOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(fake_quantize_moving_average_abs_max, + ops::FakeQuantizeMovingAverageAbsMaxKernel); +REGISTER_OPERATOR(fake_channel_wise_quantize_abs_max, + ops::FakeChannelWiseQuantizeAbsMaxOp, + ops::FakeChannelWiseQuantizeAbsMaxOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(fake_channel_wise_quantize_abs_max, + ops::FakeChannelWiseQuantizeAbsMaxKernel); diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu index a0ff6396210c2b3a7f8bd6b9f274b875d7fd4933..33bd275e5cc507ec700b3694cd8b1df9672ec512 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu +++ b/paddle/fluid/operators/fake_quantize_op.cu @@ -74,6 +74,45 @@ struct FindAbsMaxFunctor { template struct FindAbsMaxFunctor; +template +__global__ void FindChannelAbsMaxKernel(const T* in, const int n, const int c, + T* out) { + int tid = threadIdx.x; + int channel_size = n / c; + const T* in_c = in + blockIdx.x * channel_size; + extern __shared__ T shared_max_data[]; + shared_max_data[tid] = T(0); + for (int i = tid; i < channel_size; i += blockDim.x) { + T tmp = fabs(in_c[i]); + if (tmp > shared_max_data[tid]) { + shared_max_data[tid] = tmp; + } + } + __syncthreads(); + for (int i = blockDim.x / 2; i > 0; i >>= 1) { + if (tid < i && (shared_max_data[tid] < shared_max_data[tid + i])) { + shared_max_data[tid] = shared_max_data[tid + i]; + } + __syncthreads(); + } + if (tid == 0) { + out[blockIdx.x] = shared_max_data[0]; + } +} + +template +struct FindChannelAbsMaxFunctor { + void operator()(const platform::CUDADeviceContext& ctx, const T* in, + const int num, const int channel, T* out) { + int block = 1024; + int grid = channel; + FindChannelAbsMaxKernel<<>>( + in, num, channel, out); + } +}; + +template struct FindChannelAbsMaxFunctor; + template __global__ void ClipAndQuantKernel(const T* in, const T* scale, const int bin_cnt, const int n, T* out) { @@ -82,14 +121,76 @@ __global__ void ClipAndQuantKernel(const T* in, const T* scale, T s = scale[0]; for (int i = bid; i < n; i += blockDim.x * gridDim.x) { - T x = in[bid]; + T x = in[i]; + T v = x > s ? s : x; + v = v < -s ?
-s : v; + v = bin_cnt / s * v; + out[i] = round(v); + } +} + +template +struct ClipAndFakeQuantFunctor { + void operator()(const platform::CUDADeviceContext& ctx, + const framework::Tensor& in, const framework::Tensor& scale, + const int bin_cnt, framework::Tensor* out) { + int num = in.numel(); + int block = 1024; + int grid = (block - 1 + num) / block; + + const T* in_data = in.data(); + const T* scale_data = scale.data(); + T* out_data = out->mutable_data(ctx.GetPlace()); + + ClipAndQuantKernel<<>>( + in_data, scale_data, bin_cnt, num, out_data); + } +}; + +template struct ClipAndFakeQuantFunctor; + +template +__global__ void ChannelClipAndQuantKernel(const T* in, const T* scale, + const int bin_cnt, const int n, + const int c, T* out) { + int tid = threadIdx.x; + + int channel_size = n / c; + const T* in_c = in + blockIdx.x * channel_size; + T* out_c = out + blockIdx.x * channel_size; + + T s = scale[blockIdx.x]; + for (int i = tid; i < channel_size; i += blockDim.x) { + T x = in_c[i]; T v = x > s ? s : x; v = v < -s ? -s : v; v = bin_cnt / s * v; - out[bid] = round(v); + out_c[i] = round(v); } } +template +struct ChannelClipAndFakeQuantFunctor { + void operator()(const platform::CUDADeviceContext& ctx, + const framework::Tensor& in, const framework::Tensor& scale, + const int bin_cnt, const int channel, + framework::Tensor* out) { + int num = in.numel(); + int block = 1024; + int grid = channel; + + const T* in_data = in.data(); + const T* scale_data = scale.data(); + T* out_data = out->mutable_data(ctx.GetPlace()); + + ChannelClipAndQuantKernel<<>>( + in_data, scale_data, bin_cnt, num, channel, out_data); + } +}; + +template struct ChannelClipAndFakeQuantFunctor; + template __global__ void FindRangeAbsMaxAndFillArray(const T* cur_scale, const T* last_scale, @@ -148,24 +249,39 @@ struct FindRangeAbsMaxFunctor { template struct FindRangeAbsMaxFunctor; template -struct ClipAndFakeQuantFunctor { +struct FindMovingAverageAbsMaxFunctor { void operator()(const platform::CUDADeviceContext& ctx, - const framework::Tensor& in, const framework::Tensor& scale, - const int bin_cnt, framework::Tensor* out) { - int num = in.numel(); - int block = 1024; - int grid = (block - 1 + num) / block; + const framework::Tensor& in_accum, + const framework::Tensor& in_state, const T* cur_scale, + const float rate, framework::Tensor* out_state, + framework::Tensor* out_accum, framework::Tensor* out_scale) { + const auto gpu_place = boost::get(ctx.GetPlace()); - const T* in_data = in.data(); - const T* scale_data = scale.data(); - T* out_data = out->mutable_data(ctx.GetPlace()); + T accum; + memory::Copy(platform::CPUPlace(), &accum, gpu_place, in_accum.data(), + sizeof(T), 0); + T state; + memory::Copy(platform::CPUPlace(), &state, gpu_place, in_state.data(), + sizeof(T), 0); + T scale; + memory::Copy(platform::CPUPlace(), &scale, gpu_place, cur_scale, sizeof(T), + 0); - ClipAndQuantKernel<<>>( - in_data, scale_data, bin_cnt, num, out_data); + state = rate * state + 1; + accum = rate * accum + scale; + scale = accum / state; + + memory::Copy(gpu_place, out_accum->mutable_data(gpu_place), + platform::CPUPlace(), &accum, sizeof(T), 0); + memory::Copy(gpu_place, out_state->mutable_data(gpu_place), + platform::CPUPlace(), &state, sizeof(T), 0); + memory::Copy(gpu_place, out_scale->mutable_data(gpu_place), + platform::CPUPlace(), &scale, sizeof(T), 0); } }; -template struct ClipAndFakeQuantFunctor; +template struct FindMovingAverageAbsMaxFunctor; } // namespace operators } // namespace paddle @@ -174,5 +290,10 @@ 
namespace ops = paddle::operators; using CUDA = paddle::platform::CUDADeviceContext; REGISTER_OP_CUDA_KERNEL(fake_quantize_abs_max, ops::FakeQuantizeAbsMaxKernel); +REGISTER_OP_CUDA_KERNEL(fake_channel_wise_quantize_abs_max, + ops::FakeChannelWiseQuantizeAbsMaxKernel); REGISTER_OP_CUDA_KERNEL(fake_quantize_range_abs_max, ops::FakeQuantizeRangeAbsMaxKernel); +REGISTER_OP_CUDA_KERNEL( + fake_quantize_moving_average_abs_max, + ops::FakeQuantizeMovingAverageAbsMaxKernel); diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h index 7ace7573ec5c03ab8788cfc0aab614b7f80ea073..5ab38b086df7f9df33996ec83b5ec07047c204ba 100644 --- a/paddle/fluid/operators/fake_quantize_op.h +++ b/paddle/fluid/operators/fake_quantize_op.h @@ -42,12 +42,33 @@ struct FindRangeAbsMaxFunctor { framework::Tensor* scales_arr, framework::Tensor* out_scale); }; +template +struct FindChannelAbsMaxFunctor { + void operator()(const DeviceContext& ctx, const T* in, const int num, + const int channel, T* out); +}; + +template +struct ChannelClipAndFakeQuantFunctor { + void operator()(const DeviceContext& ctx, const framework::Tensor& in, + const framework::Tensor& scale, const int bin_cnt, + const int channel, framework::Tensor* out); +}; + +template +struct FindMovingAverageAbsMaxFunctor { + void operator()(const DeviceContext& ctx, const framework::Tensor& in_accum, + const framework::Tensor& in_state, + const framework::Tensor& cur_scale, + framework::Tensor* out_state, framework::Tensor* out_accum, + framework::Tensor* out_scale); +}; + template class FakeQuantizeAbsMaxKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto* in = context.Input("X"); - auto* out = context.Output("Out"); auto* out_scale = context.Output("OutScale"); T* out_s = out_scale->mutable_data(context.GetPlace()); @@ -63,6 +84,28 @@ class FakeQuantizeAbsMaxKernel : public framework::OpKernel { } }; +template +class FakeChannelWiseQuantizeAbsMaxKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + + auto* out = context.Output("Out"); + auto* out_scale = context.Output("OutScale"); + T* out_scale_data = out_scale->mutable_data(context.GetPlace()); + out->mutable_data(context.GetPlace()); + + int bit_length = context.Attr("bit_length"); + int bin_cnt = std::pow(2, bit_length - 1) - 1; + + auto& dev_ctx = context.template device_context(); + FindChannelAbsMaxFunctor()( + dev_ctx, in->data(), in->numel(), in->dims()[0], out_scale_data); + ChannelClipAndFakeQuantFunctor()( + dev_ctx, *in, *out_scale, bin_cnt, in->dims()[0], out); + } +}; + template class FakeQuantizeRangeAbsMaxKernel : public framework::OpKernel { public: @@ -105,5 +148,54 @@ class FakeQuantizeRangeAbsMaxKernel : public framework::OpKernel { } }; +template +class FakeQuantizeMovingAverageAbsMaxKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* in_scale = context.Input("InScale"); + auto* out = context.Output("Out"); + out->mutable_data(context.GetPlace()); + + bool is_test = context.Attr("is_test"); + int bit_length = context.Attr("bit_length"); + int bin_cnt = std::pow(2, bit_length - 1) - 1; + auto& dev_ctx = context.template device_context(); + + // testing + if (is_test) { + ClipAndFakeQuantFunctor()(dev_ctx, *in, *in_scale, + bin_cnt, out); + return; + } + + // 
training + auto* in_accum = context.Input("InAccum"); + auto* in_state = context.Input("InState"); + auto& allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + auto cur_scale = allocator.Allocate(1 * sizeof(T)); + T* cur_scale_data = static_cast(cur_scale->ptr()); + + FindAbsMaxFunctor()(dev_ctx, in->data(), in->numel(), + cur_scale_data); + + auto* out_state = context.Output("OutState"); + auto* out_accum = context.Output("OutAccum"); + auto* out_scale = context.Output("OutScale"); + out_state->mutable_data(context.GetPlace()); + out_accum->mutable_data(context.GetPlace()); + out_scale->mutable_data(context.GetPlace()); + float moving_rate = context.Attr("moving_rate"); + + FindMovingAverageAbsMaxFunctor()( + dev_ctx, *in_accum, *in_state, cur_scale_data, moving_rate, out_state, + out_accum, out_scale); + + ClipAndFakeQuantFunctor()(dev_ctx, *in, *out_scale, + bin_cnt, out); + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc index eb4617a9359353820fc41b9ad1c8db5327fdacde..242f5390b806756283686dae2e2c32b93c2bd71e 100644 --- a/paddle/fluid/operators/fc_op.cc +++ b/paddle/fluid/operators/fc_op.cc @@ -55,17 +55,8 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const { "The input tensor Input's rank of FCOp should be larger than " "in_num_col_dims."); - auto in_mat_dims = framework::flatten_to_2d(in_dims, in_num_col_dims); - PADDLE_ENFORCE_EQ( - in_mat_dims[1], w_dims[0], - "Fully Connected input and weigth size do not match. %s, %s"); - std::vector output_dims; - output_dims.reserve(static_cast(in_num_col_dims + 1)); - for (int i = 0; i < in_num_col_dims; ++i) { - output_dims.push_back(in_dims[i]); - } - output_dims.push_back(w_dims[1]); + FCOutputSize(in_dims, w_dims, output_dims, in_num_col_dims); ctx->SetOutputDim("Out", framework::make_ddim(output_dims)); ctx->ShareLoD("Input", "Out"); @@ -128,6 +119,9 @@ void FCOpMaker::Make() { AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); + AddAttr(framework::kAllKernelsMustComputeRuntimeShape, + "Skip calling InferShape() function in the runtime.") + .SetDefault(true); AddComment(R"DOC( Fully Connected Operator. 
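The output-shape rule that this patch factors into the FCOutputSize helper (added to fc_op.h below) can be restated independently of DDim. A hedged sketch over plain vectors, with illustrative names:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// The first in_num_col_dims input dimensions are kept as-is; the remaining
// dimensions are flattened and must equal w_dims[0]; the last output
// dimension is w_dims[1].
std::vector<int64_t> FcOutDimsSketch(const std::vector<int64_t>& in_dims,
                                     const std::vector<int64_t>& w_dims,
                                     int in_num_col_dims) {
  int64_t flattened = 1;
  for (size_t i = static_cast<size_t>(in_num_col_dims); i < in_dims.size();
       ++i) {
    flattened *= in_dims[i];
  }
  assert(flattened == w_dims[0] && "input and weight sizes must match");
  std::vector<int64_t> out(in_dims.begin(), in_dims.begin() + in_num_col_dims);
  out.push_back(w_dims[1]);
  return out;
}
// e.g. in_dims = {N, C, H, W}, in_num_col_dims = 1, w_dims = {C*H*W, D}
// yields {N, D}.
```

Computing this inside the kernel (and resizing Out there) is what lets the op skip InferShape at runtime via kAllKernelsMustComputeRuntimeShape.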
@@ -142,13 +136,20 @@ class FCOpKernel : public framework::OpKernel { void Compute(const paddle::framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); - auto input = ctx.Input("Input"); + auto input = ctx.Input("Input"); auto w = ctx.Input("W"); auto bias = ctx.Input("Bias"); - auto output = ctx.Output("Out"); + auto output = ctx.Output("Out"); + int in_num_col_dims = ctx.Attr("in_num_col_dims"); auto w_dims = w->dims(); + + std::vector output_dims; + FCOutputSize(input->dims(), w_dims, output_dims, in_num_col_dims); + output->Resize(framework::make_ddim(output_dims)); + output->set_lod(input->lod()); + + auto out_dims = output->dims(); - int M = framework::product(out_dims) / out_dims[out_dims.size() - 1]; + int M = framework::product(out_dims) / w_dims[1]; const T* input_data = input->data(); const T* w_data = w->data(); diff --git a/paddle/fluid/operators/fc_op.h b/paddle/fluid/operators/fc_op.h index e1b780fc0c401fbf34a9db03aa31137cbc016939..b82a63cd830b569c4541bbaffb5affb75394773a 100644 --- a/paddle/fluid/operators/fc_op.h +++ b/paddle/fluid/operators/fc_op.h @@ -48,5 +48,21 @@ class FCOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override; }; +inline void FCOutputSize(const framework::DDim& in_dims, + const framework::DDim& w_dims, + std::vector& out_dims, // NOLINT + int in_num_col_dims) { + auto in_mat_dims = framework::flatten_to_2d(in_dims, in_num_col_dims); + PADDLE_ENFORCE_EQ( + in_mat_dims[1], w_dims[0], + "Fully Connected input and weight size do not match. %s, %s"); + + out_dims.reserve(static_cast(in_num_col_dims + 1)); + for (int i = 0; i < in_num_col_dims; ++i) { + out_dims.push_back(in_dims[i]); + } + out_dims.push_back(w_dims[1]); +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index c86430524e182acd66c61e3e01672a32f15a62c3..cf2f4776cf2ae9a707d3b841c2a41b7f82ca7833 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -39,12 +39,11 @@ class FillConstantOp : public framework::OperatorWithKernel { class FillConstantOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc& op_desc, - framework::BlockDesc* block) const override { + void operator()(framework::InferVarTypeContext* ctx) const override { auto data_type = static_cast( - boost::get(op_desc.GetAttr("dtype"))); - auto& out_var_name = op_desc.Output("Out").front(); - block->Var(out_var_name)->SetDataType(data_type); + boost::get(ctx->GetAttr("dtype"))); + auto& out_var_name = ctx->Output("Out").front(); + ctx->SetDataType(out_var_name, data_type); } }; diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc index bb904166c4a19997a57723d9f2e50cc839aae960..7f43a1cfe977a63b5ffb6bd8dc96bf696ed15282 100644 --- a/paddle/fluid/operators/flatten_op.cc +++ b/paddle/fluid/operators/flatten_op.cc @@ -267,14 +267,10 @@ class Flatten2GradOp : public framework::OperatorBase { } }; -class FlattenOpInplaceInToOut : public framework::InplaceInToOut { +class FlattenOpInplaceInToOut : public framework::InplaceOpInference { public: - using InplaceInToOut::InplaceInToOut; - - protected: - std::unordered_map Apply( - const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { + std::unordered_map operator()( - const framework::OpDesc &op_desc) const override { std::unordered_map
inplace_in_to_out = { {"X", "Out"}, }; @@ -282,13 +278,10 @@ class FlattenOpInplaceInToOut : public framework::InplaceInToOut { } }; -class FlattenGradInplaceinToOut : public framework::InplaceInToOut { - using InplaceInToOut::InplaceInToOut; - - protected: - std::unordered_map Apply( - const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { +class FlattenGradInplaceinToOut : public framework::InplaceOpInference { + public: + std::unordered_map operator()( + const framework::OpDesc &op_desc) const override { std::unordered_map inplace_in_to_out = { {framework::GradVarName("Out"), framework::GradVarName("X")}, }; diff --git a/paddle/fluid/operators/fsp_op.cc b/paddle/fluid/operators/fsp_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..fbe8e56a6160219175bd573a2ff186eb35e56fdf --- /dev/null +++ b/paddle/fluid/operators/fsp_op.cc @@ -0,0 +1,128 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fsp_op.h" + +namespace paddle { +namespace operators { + +class FSPOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of FSPOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) of FSPOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of FSPOp should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + + PADDLE_ENFORCE( + x_dims.size() == 4, + "The Input(X) must have shape [batch_size, channel, height, width]."); + PADDLE_ENFORCE( + y_dims.size() == 4, + "The Input(Y) must have shape [batch_size, channel, height, width]."); + PADDLE_ENFORCE( + (x_dims[2] == y_dims[2]) && (x_dims[3] == y_dims[3]), + "The Input(X) and Input(Y) should have the same height and width."); + + ctx->SetOutputDim("Out", {x_dims[0], x_dims[1], y_dims[1]}); + ctx->ShareLoD("X", "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + framework::LibraryType library_{framework::LibraryType::kPlain}; + framework::DataLayout layout_ = framework::DataLayout::kAnyLayout; + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context(), layout_, library_); + } +}; + +class FSPOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor) The input of FSP op with shape [batch_size, x_channel, " + "height, width]"); + AddInput("Y", + "(Tensor) The input of FSP op with shape" + "[batch_size, y_channel, height, width]." + "The y_channel can be different from the x_channel of Input(X)" + " while the other dimensions must be the same as Input(X)'s."); + AddOutput( + "Out", + "(Tensor) The output of FSP op with shape " + "[batch_size, x_channel, y_channel].
The x_channel is the channel " + "of Input(X) and the y_channel is the channel of Input(Y)."); + AddComment(R"DOC( + This op is used to calculate the flow of solution procedure (FSP) matrix of two feature maps. + Given feature map x with shape [x_channel, h, w] and feature map y with shape + [y_channel, h, w], we can get the FSP matrix of x and y in two steps: + + step 1: reshape x into a matrix with shape [x_channel, h * w], and reshape and + transpose y into a matrix with shape [h * w, y_channel] + step 2: multiply x and y to get the FSP matrix with shape [x_channel, y_channel] + + The output is a batch of FSP matrices. + )DOC"); + } +}; + +class FSPOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + auto x_grad_name = framework::GradVarName("X"); + auto y_grad_name = framework::GradVarName("Y"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + if (ctx->HasOutput(y_grad_name)) { + ctx->SetOutputDim(y_grad_name, y_dims); + } + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.device_context()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fsp, ops::FSPOp, ops::FSPOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(fsp_grad, ops::FSPOpGrad); +REGISTER_OP_CPU_KERNEL( + fsp, ops::FSPOpKernel, + ops::FSPOpKernel); +REGISTER_OP_CPU_KERNEL( + fsp_grad, ops::FSPGradOpKernel, + ops::FSPGradOpKernel); diff --git a/paddle/fluid/operators/fsp_op.cu b/paddle/fluid/operators/fsp_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..4fd7ba04ff9af1806963427ad58c68fc216e82ac --- /dev/null +++ b/paddle/fluid/operators/fsp_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/fsp_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_CUDA_KERNEL(fsp, ops::FSPOpKernel, + ops::FSPOpKernel); +REGISTER_OP_CUDA_KERNEL(fsp_grad, + ops::FSPGradOpKernel, + ops::FSPGradOpKernel); diff --git a/paddle/fluid/operators/fsp_op.h b/paddle/fluid/operators/fsp_op.h new file mode 100644 index 0000000000000000000000000000000000000000..544af2b7d9b9729fe5dce08793da6c983fbcc6fa --- /dev/null +++ b/paddle/fluid/operators/fsp_op.h @@ -0,0 +1,136 @@ +/* Copyright (c) 2019 PaddlePaddle Authors.
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class FSPOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* y = context.Input("Y"); + auto* output = context.Output("Out"); + output->mutable_data(context.GetPlace()); + auto x_dims = x->dims(); + auto y_dims = y->dims(); + + auto batch_size = x_dims[0]; + auto x_channel = x_dims[1]; + auto y_channel = y_dims[1]; + auto height = x_dims[2]; + auto width = x_dims[3]; + + auto blas = math::GetBlas(context); + + math::MatDescriptor x_mat_desc; + x_mat_desc.height_ = x_channel; + x_mat_desc.width_ = height * width; + x_mat_desc.batch_size_ = batch_size; + x_mat_desc.stride_ = x_channel * height * width; + + math::MatDescriptor y_mat_desc; + y_mat_desc.height_ = height * width; + y_mat_desc.width_ = y_channel; + y_mat_desc.batch_size_ = batch_size; + y_mat_desc.stride_ = y_channel * height * width; + y_mat_desc.trans_ = true; + + blas.MatMul(*x, x_mat_desc, *y, y_mat_desc, + static_cast(1.0 / (height * width)), output, + static_cast(0.0)); + } +}; + +template +class FSPGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* d_x = context.Output(framework::GradVarName("X")); + auto* d_y = context.Output(framework::GradVarName("Y")); + if (d_x == nullptr && d_y == nullptr) { + return; + } + auto* d_out = context.Input(framework::GradVarName("Out")); + auto d_out_dims = d_out->dims(); + auto batch_size = d_out_dims[0]; + auto x_channel = d_out_dims[1]; + auto y_channel = d_out_dims[2]; + int64_t h = 0; + int64_t w = 0; + + auto blas = math::GetBlas(context); + math::SetConstant set_zero; + if (d_x != nullptr) { + d_x->mutable_data(context.GetPlace()); + set_zero(context.template device_context(), d_x, + static_cast(0)); + auto* y = context.Input("Y"); + auto y_dims = y->dims(); + h = y_dims[2]; + w = y_dims[3]; + + math::MatDescriptor d_out_mat_desc; + d_out_mat_desc.height_ = x_channel; + d_out_mat_desc.width_ = y_channel; + d_out_mat_desc.batch_size_ = batch_size; + d_out_mat_desc.stride_ = x_channel * y_channel; + + math::MatDescriptor y_mat_desc; + y_mat_desc.height_ = y_channel; + y_mat_desc.width_ = h * w; + y_mat_desc.batch_size_ = batch_size; + y_mat_desc.stride_ = y_channel * h * w; + + blas.MatMul(*d_out, d_out_mat_desc, *y, y_mat_desc, + static_cast(1.0 / (h * w)), d_x, static_cast(0.0)); + } + + if (d_y != nullptr) { + d_y->mutable_data(context.GetPlace()); + set_zero(context.template device_context(), d_y, + static_cast(0)); + auto* x = context.Input("X"); + auto x_dims = x->dims(); + h = x_dims[2]; + w = x_dims[3]; + + math::MatDescriptor d_out_mat_desc; + d_out_mat_desc.height_ = 
y_channel; + d_out_mat_desc.width_ = x_channel; + d_out_mat_desc.batch_size_ = batch_size; + d_out_mat_desc.stride_ = x_channel * y_channel; + d_out_mat_desc.trans_ = true; + + math::MatDescriptor x_mat_desc; + x_mat_desc.height_ = x_channel; + x_mat_desc.width_ = h * w; + x_mat_desc.batch_size_ = batch_size; + x_mat_desc.stride_ = x_channel * h * w; + + blas.MatMul(*d_out, d_out_mat_desc, *x, x_mat_desc, + static_cast(1.0 / (h * w)), d_y, static_cast(0.0)); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc index 80caf70b08e65932d6ccb90a5293d072b2b2bc72..9cc94ab88d59dbf8215aca6cd8be3ba19afe32d0 100644 --- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc @@ -23,9 +23,6 @@ class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - if (ctx->IsRuntime()) { - return; - } PADDLE_ENFORCE(ctx->HasInput("W"), "Input W of FusedEmbeddingSeqPoolOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("Ids"), @@ -91,6 +88,9 @@ class FusedEmbeddingSeqPoolOpMaker : public framework::OpProtoAndCheckerMaker { "(boolean, default false) " "Sparse update.") .SetDefault(false); + AddAttr(framework::kAllKernelsMustComputeRuntimeShape, + "Skip calling InferShape() function in the runtime.") + .SetDefault(true); AddComment(R"DOC( FusedEmbeddingSeqPool Operator. @@ -138,22 +138,20 @@ class FusedEmbeddingSeqPoolOpGrad : public framework::OperatorWithKernel { class FusedEmbeddingSeqPoolOpGradVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc& op_desc, - framework::BlockDesc* block) const override { - auto out_var_name = op_desc.Output(framework::GradVarName("W")).front(); - auto attr = op_desc.GetAttr("is_sparse"); + void operator()(framework::InferVarTypeContext* ctx) const override { + auto out_var_name = ctx->Output(framework::GradVarName("W")).front(); + auto attr = ctx->GetAttr("is_sparse"); bool is_sparse = boost::get(attr); if (is_sparse) { VLOG(3) << "fused_embedding_seq_pool_grad op " << framework::GradVarName("W") << " is set to SelectedRows"; - block->Var(out_var_name) - ->SetType(framework::proto::VarType::SELECTED_ROWS); + ctx->SetType(out_var_name, framework::proto::VarType::SELECTED_ROWS); } else { VLOG(3) << "fused_embedding_seq_pool_grad op " << framework::GradVarName("W") << " is set to LoDTensor"; - block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR); + ctx->SetType(out_var_name, framework::proto::VarType::LOD_TENSOR); } - block->Var(out_var_name)->SetDataType(block->Var("W")->GetDataType()); + ctx->SetDataType(out_var_name, ctx->GetDataType(ctx->Input("W")[0])); } }; diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h index f13c02038606e52337b7ef85545e37054e54b631..4651c2b2ba81a404b64818fec81cef79634ff036 100644 --- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h @@ -52,8 +52,9 @@ struct EmbeddingVSumFunctor { out_width, jit::SeqPoolType::kSum); for (size_t i = 0; i != ids_lod.size() - 1; ++i) { attr.index_height = ids_lod[i + 1] - ids_lod[i]; - auto emb_seqpool = jit::Get, - platform::CPUPlace>(attr); + auto 
emb_seqpool = + jit::KernelFuncs, platform::CPUPlace>::Cache() + .At(attr); emb_seqpool(table, ids + ids_lod[i] * idx_width, output + i * out_width, &attr); } @@ -120,6 +121,8 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { auto *ids = context.Input("Ids"); auto *d_output = context.Input(framework::GradVarName("Out")); auto *d_table = context.Output(framework::GradVarName("W")); + // runtime shape + d_table->set_height(table_dim[0]); auto *ids_data = ids->data(); int64_t ids_num = ids->numel(); @@ -135,8 +138,9 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { T *d_table_data = d_table_value->mutable_data(context.GetPlace()); const T *d_output_data = d_output->data(); - auto vbroadcast = jit::Get, - platform::CPUPlace>(out_width); + auto vbroadcast = + jit::KernelFuncs, platform::CPUPlace>::Cache() + .At(out_width); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { int64_t h = static_cast(lod[i + 1] - lod[i]); const T *src = d_output_data + i * out_width; diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index 66acba49e5ac25c5097042225ccfe30b258040fa..ba5f0747c4d04bbb41f34dc7f895b22d38392ea6 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -182,29 +182,32 @@ class FusionGRUKernel : public framework::OpKernel { const int total_T = x_dims[0]; \ const int D3 = wh_dims[1] -#define INIT_OTHER_DEFINES \ - auto* h0 = ctx.Input("H0"); \ - auto* wx = ctx.Input("WeightX"); \ - auto* bias = ctx.Input("Bias"); \ - auto* hidden_out = ctx.Output("Hidden"); \ - bool is_reverse = ctx.Attr("is_reverse"); \ - const int M = x_dims[1]; \ - const int D = wh_dims[0]; \ - const int D2 = D * 2; \ - const jit::gru_attr_t attr( \ - D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ - jit::to_kerneltype(ctx.Attr("activation"))); \ - jit::gru_t one_step; \ - auto ComputeH1 = \ - jit::Get, platform::CPUPlace>(attr); \ - auto ComputeHtPart1 = \ - jit::Get, platform::CPUPlace>(attr); \ - auto ComputeHtPart2 = \ - jit::Get, platform::CPUPlace>(attr); \ - const T* x_data = x->data(); \ - const T* wx_data = wx->data(); \ - const T* wh_data = wh->data(); \ - auto place = ctx.GetPlace(); \ +#define INIT_OTHER_DEFINES \ + auto* h0 = ctx.Input("H0"); \ + auto* wx = ctx.Input("WeightX"); \ + auto* bias = ctx.Input("Bias"); \ + auto* hidden_out = ctx.Output("Hidden"); \ + bool is_reverse = ctx.Attr("is_reverse"); \ + const int M = x_dims[1]; \ + const int D = wh_dims[0]; \ + const int D2 = D * 2; \ + const jit::gru_attr_t attr( \ + D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ + jit::to_kerneltype(ctx.Attr("activation"))); \ + jit::gru_t one_step; \ + auto ComputeH1 = \ + jit::KernelFuncs, platform::CPUPlace>::Cache().At( \ + attr); \ + auto ComputeHtPart1 = \ + jit::KernelFuncs, platform::CPUPlace>::Cache() \ + .At(attr); \ + auto ComputeHtPart2 = \ + jit::KernelFuncs, platform::CPUPlace>::Cache() \ + .At(attr); \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = wh->data(); \ + auto place = ctx.GetPlace(); \ T* xx_data = xx->mutable_data(place) void SeqCompute(const framework::ExecutionContext& ctx) const { diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc index b11b7c11bfe0ae4c79d5bb39844bce618649c44d..c8c07bd126d5b4eac688d43fd794856f8222525a 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.cc +++ 
b/paddle/fluid/operators/fused/fusion_lstm_op.cc @@ -235,32 +235,34 @@ class FuisonLSTMKernel : public framework::OpKernel { const int D = wh_dims[0]; \ const int D4 = wh_dims[1] -#define INIT_OTHER_DEFINES \ - const T* x_data = x->data(); \ - const T* wx_data = wx->data(); \ - const T* wh_data = wh->data(); \ - /* diagonal weight*/ \ - const T* wp_data = bias->data() + D4; \ - /* for peephole only*/ \ - T* checked_cell_data = nullptr; \ - auto place = ctx.GetPlace(); \ - if (use_peepholes) { \ - /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ - auto* checked_cell = ctx.Output("CheckedCell"); \ - checked_cell_data = checked_cell->mutable_data(place); \ - } \ - const jit::lstm_attr_t attr( \ - D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ - jit::to_kerneltype(ctx.Attr("candidate_activation")), \ - jit::to_kerneltype(ctx.Attr("cell_activation")), \ - use_peepholes); \ - jit::lstm_t one_step; \ - one_step.wp = wp_data; \ - one_step.checked = checked_cell_data; \ - auto ComputeC1H1 = \ - jit::Get, platform::CPUPlace>(attr); \ - auto ComputeCtHt = \ - jit::Get, platform::CPUPlace>(attr) +#define INIT_OTHER_DEFINES \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = wh->data(); \ + /* diagonal weight*/ \ + const T* wp_data = bias->data() + D4; \ + /* for peephole only*/ \ + T* checked_cell_data = nullptr; \ + auto place = ctx.GetPlace(); \ + if (use_peepholes) { \ + /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ + auto* checked_cell = ctx.Output("CheckedCell"); \ + checked_cell_data = checked_cell->mutable_data(place); \ + } \ + const jit::lstm_attr_t attr( \ + D, jit::to_kerneltype(ctx.Attr("gate_activation")), \ + jit::to_kerneltype(ctx.Attr("candidate_activation")), \ + jit::to_kerneltype(ctx.Attr("cell_activation")), \ + use_peepholes); \ + jit::lstm_t one_step; \ + one_step.wp = wp_data; \ + one_step.checked = checked_cell_data; \ + auto ComputeC1H1 = \ + jit::KernelFuncs, platform::CPUPlace>::Cache().At( \ + attr); \ + auto ComputeCtHt = \ + jit::KernelFuncs, platform::CPUPlace>::Cache().At( \ + attr) // Wh GEMM #define GEMM_WH_ADDON(bs, prev, out) \ diff --git a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc index 8ecdf2ed9d40e7f5dc9226c635a8c8e6406a76ba..6be35de65f48525b2da7d5c9ef260b2d0798b67b 100644 --- a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc +++ b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc @@ -82,9 +82,11 @@ template static void fc_relu(const T* x, const T* w, const T* b, T* y, const jit::matmul_attr_t& attr) { auto matmul = - jit::Get, platform::CPUPlace>(attr); + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + attr); auto addbias_relu = - jit::Get, platform::CPUPlace>(attr.n); + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + attr.n); matmul(x, w, y, &attr); T* dst = y; for (int i = 0; i < attr.m; ++i) { diff --git a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc index d48bdafe0aa38cb860b54b2e41ebad3421b93bce..25916768c08e7222ba95bd6e1999400a923b21a3 100644 --- a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc @@ -98,7 +98,7 @@ class FusionSeqPoolConcatKernel : public framework::OpKernel { attr.type = jit::SeqPoolType::kSqrt; } auto seqpool = - jit::Get, platform::CPUPlace>( + jit::KernelFuncs, platform::CPUPlace>::Cache().At( attr); size_t n = ins.size(); size_t 
dst_step_size = n * w; diff --git a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc index 8493f4468fc994964116d99dc85dd34fb19a44cc..53679ebddee1ceec102b5861c54b398aa4da4cde 100644 --- a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc +++ b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc @@ -94,19 +94,23 @@ class FusionSquaredMatSubKernel : public framework::OpKernel { int o_numel = attr.m * attr.n; auto vsquare_x = - jit::Get, platform::CPUPlace>(attr.m * - attr.k); + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + attr.m * attr.k); auto vsquare_y = - jit::Get, platform::CPUPlace>(attr.k * - attr.n); + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + attr.k * attr.n); auto vsquare_xy = - jit::Get, platform::CPUPlace>(o_numel); + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + o_numel); auto vsub = - jit::Get, platform::CPUPlace>(o_numel); + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + o_numel); auto vscal = - jit::Get, platform::CPUPlace>(o_numel); + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + o_numel); auto matmul = - jit::Get, platform::CPUPlace>(attr); + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + attr); const T* x_data = x->data(); const T* y_data = y->data(); diff --git a/paddle/fluid/operators/gather.cu.h b/paddle/fluid/operators/gather.cu.h index e4df59c5d51c390cf593add0c5562665c91f33f6..5bc2e63757f19c1dc8a7d41fae9621a2816ff31b 100644 --- a/paddle/fluid/operators/gather.cu.h +++ b/paddle/fluid/operators/gather.cu.h @@ -64,6 +64,7 @@ void GPUGather(const platform::DeviceContext& ctx, const Tensor& src, for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i]; const T* p_src = src.data(); + // why must be int? const int* p_index = index.data(); T* p_output = output->data(); diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index 55cef93aacd43174edefbb8aa740bcbea3d8feef..91f3818f2165c91eef88921859afe5703bd65685 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/gather_op.h" +#include +#include +#include #include "paddle/fluid/framework/ddim.h" namespace paddle { @@ -59,8 +62,9 @@ class GatherGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - ctx.device_context()); + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.device_context()); } }; @@ -94,13 +98,34 @@ Out = [[3, 4], )DOC"); } }; + +class GatherGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("gather_grad"); + op->SetInput("Index", Input("Index")); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(GatherGradNoNeedBufferVarInference, "X"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(gather_grad, ops::GatherGradOp); + ops::GatherGradOpDescMaker); +REGISTER_OPERATOR(gather_grad, ops::GatherGradOp, + ops::GatherGradNoNeedBufferVarInference); REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel, ops::GatherOpKernel, ops::GatherOpKernel, ops::GatherOpKernel, diff --git a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc index a4ae19d9c1e3bb2af3eb95650fbb5aabb8944a36..c0893359af2f4de4ed8fd88ebff122447e8d84c7 100644 --- a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc +++ b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc @@ -81,15 +81,12 @@ GetTensorFromSelectedRows is used to get the tensor from SelectedRows. class GetTensorFromSelectedRowsOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const final { - auto out_var_name = op_desc.Output("Out").front(); - auto in_var_name = op_desc.Input("X").front(); - - auto out_var = block->FindRecursiveOrCreateVar(out_var_name); - auto in_var = block->FindRecursiveOrCreateVar(in_var_name); - out_var.SetType(framework::proto::VarType::LOD_TENSOR); - out_var.SetDataType(in_var.GetDataType()); + void operator()(framework::InferVarTypeContext *ctx) const { // NOLINT + auto out_var_name = ctx->Output("Out").front(); + auto in_var_name = ctx->Input("X").front(); + + ctx->SetType(out_var_name, framework::proto::VarType::LOD_TENSOR); + ctx->SetDataType(out_var_name, ctx->GetDataType(in_var_name)); } }; diff --git a/paddle/fluid/operators/group_norm_op.cc b/paddle/fluid/operators/group_norm_op.cc index cbdffa0db8277dbf7257c3b3c1d03c1b459d5b2b..2ab40f482d7a1463703085037bcb94fd4aecf377 100644 --- a/paddle/fluid/operators/group_norm_op.cc +++ b/paddle/fluid/operators/group_norm_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/group_norm_op.h" +#include #include +#include namespace paddle { namespace operators { @@ -170,26 +172,18 @@ class GroupNormGradMaker : public framework::SingleGradOpDescMaker { } }; -class GroupNormInplaceInToOut : public framework::InplaceInToOut { +class GroupNormInplaceInToOut : public framework::InplaceOpInference { public: - using InplaceInToOut::InplaceInToOut; - - protected: - std::unordered_map Apply( - const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { + std::unordered_map operator()( + const framework::OpDesc &op_desc) const override { return {{"X", "Y"}}; } }; -class GroupNormGradInplaceInToOut : public framework::InplaceInToOut { +class GroupNormGradInplaceInToOut : public framework::InplaceOpInference { public: - using InplaceInToOut::InplaceInToOut; - - protected: - std::unordered_map Apply( - const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { + std::unordered_map operator()( + const framework::OpDesc &op_desc) const override { return {{framework::GradVarName("Y"), framework::GradVarName("X")}}; } }; diff --git a/paddle/fluid/operators/hash_op.cc b/paddle/fluid/operators/hash_op.cc index 7a29f80ff1ce413519ea9cea6a35747bdced5885..82222d0a7e739e15a779541c14576ce2de24a3d2 100644 --- a/paddle/fluid/operators/hash_op.cc +++ b/paddle/fluid/operators/hash_op.cc @@ -26,9 +26,6 @@ class HashOp : public framework::OperatorWithKernel { : OperatorWithKernel(type, inputs, outputs, attrs) {} void InferShape(framework::InferShapeContext *ctx) const override { - if (ctx->IsRuntime()) { - return; - } PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of HashOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), @@ -57,6 +54,9 @@ $$Out = scale * X$$ )DOC"); AddAttr("num_hash", "").SetDefault(1); AddAttr("mod_by", "").SetDefault(100000); + AddAttr(framework::kAllKernelsMustComputeRuntimeShape, + "Skip calling InferShape() function in the runtime.") + .SetDefault(true); } }; diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc index 6ca6f0bc04aa696852ed7338dcb4b88a49b2fc81..d0e1057c4357e372d3ab396841de7b2d0577d365 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -197,38 +197,32 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel { class HierarchicalSigmoidGradOpGradVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc& op_desc, - framework::BlockDesc* block) const override { - auto w_grad_var_name = op_desc.Output(framework::GradVarName("W")).front(); - auto bias_grad_var_name_vec = - op_desc.Output(framework::GradVarName("Bias")); + void operator()(framework::InferVarTypeContext* ctx) const override { + auto w_grad_var_name = ctx->Output(framework::GradVarName("W")).front(); + auto bias_grad_var_name_vec = ctx->Output(framework::GradVarName("Bias")); std::string bias_grad_var_name; bool hasBias = false; if (bias_grad_var_name_vec.size()) { hasBias = true; - bias_grad_var_name = - op_desc.Output(framework::GradVarName("Bias")).front(); + bias_grad_var_name = ctx->Output(framework::GradVarName("Bias")).front(); } - auto attr = op_desc.GetAttr("is_sparse"); + auto attr = ctx->GetAttr("is_sparse"); bool is_sparse = boost::get(attr); if (is_sparse) { VLOG(30) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W") << " is set to SelectedRows"; - block->Var(w_grad_var_name) - 
->SetType(framework::proto::VarType::SELECTED_ROWS); + ctx->SetType(w_grad_var_name, framework::proto::VarType::SELECTED_ROWS); } else { VLOG(30) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W") << " is set to LoDTensor"; - block->Var(w_grad_var_name) - ->SetType(framework::proto::VarType::LOD_TENSOR); + ctx->SetType(w_grad_var_name, framework::proto::VarType::LOD_TENSOR); } if (hasBias) { VLOG(30) << "hierarchical_sigmoid_grad op " << framework::GradVarName("Bias") << " is set to LoDTensor"; - block->Var(bias_grad_var_name) - ->SetType(framework::proto::VarType::LOD_TENSOR); + ctx->SetType(bias_grad_var_name, framework::proto::VarType::LOD_TENSOR); } - block->Var(w_grad_var_name)->SetDataType(block->Var("W")->GetDataType()); + ctx->SetDataType(w_grad_var_name, ctx->GetDataType(ctx->Input("W")[0])); } }; diff --git a/paddle/fluid/operators/jit/CMakeLists.txt b/paddle/fluid/operators/jit/CMakeLists.txt index 35775d7ec9efcdbad69e4491792f7d4e513832ad..47d6c83f2adf8c4b7476410ce7c1d435633a8bfe 100644 --- a/paddle/fluid/operators/jit/CMakeLists.txt +++ b/paddle/fluid/operators/jit/CMakeLists.txt @@ -5,7 +5,7 @@ file(APPEND ${jit_file} "\#pragma once\n") file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/helper.h\"\n") file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/registry.h\"\n\n") -set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place) +set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place xxhash) file(GLOB jit_kernel_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") list(REMOVE_ITEM jit_kernel_cc_srcs test.cc benchmark.cc) diff --git a/paddle/fluid/operators/jit/README.en.md b/paddle/fluid/operators/jit/README.en.md index 8670ec2ff28ac8353217e0ee2f8c9b784e488ac7..7d4dc6d47a512ee7ed75d99800968a38de98f090 100644 --- a/paddle/fluid/operators/jit/README.en.md +++ b/paddle/fluid/operators/jit/README.en.md @@ -1,7 +1,7 @@ # JIT Kernel JIT(Just In Time) Kernel contains actually generated code and some other implementations with the same logic. -Each implementations has its own condition to use, defined in `UseMe`. +Each implementation has its own condition to use, defined in `CanBeUsed`. They are combined together to get the best performance of one single independent function. They could be some very simple functions like vector multiply, or some complicated functions like LSTM. And they can be composed with some other existing jit kernels to build up a complex function. @@ -42,35 +42,62 @@ All basic definitions of jit kernels are addressed in `paddle/fluid/operators/ ## How to use -One simple function `jit::Get`, which is very easy to use, is supported to get the kernel. -It can automatically return the expected function with best performance under the given attributes. -All kernels are inlcuded in `paddle/fluid/operators/jit/kernels.h`, you can only include this one header to get all the registered kernels. +We provide these methods to get the functions: +- `GetAllCandidateFuncs`. It returns all the supported implementations. All of the implementations produce the same result. You can run a runtime benchmark to choose which one should actually be used. +- `GetDefaultBestFunc`. It returns a single default function pointer, which is tuned offline with some general configurations and attributes. This should cover most situations. +- `KernelFuncs::Cache()`. It gets the default best function and caches it, so the next call with the same attribute reuses it. +- `GetReferFunc`.
It only gets the reference code on CPU, and all the other implementations share the same logic as this reference code. + +And here are some examples: + +Get from cache: + +```cpp + using T = float; + jit::seq_pool_attr_t attr(width, jit::SeqPoolType::kSum); + auto seqpool_func = jit::KernelFuncs, platform::CPUPlace>::Cache().At(attr); + seqpool_func(src_data, dst_data, &attr); +``` + +Get all implementations and run once: + +```cpp + using T = float; + jit::seq_pool_attr_t attr(width, jit::SeqPoolType::kSum); + auto funcs = jit::GetAllCandidateFuncsWithTypes, platform::CPUPlace>(attr); + for (auto f : funcs) { + LOG(INFO) << "Kernel implementation type: " << f.first; + f.second(src_data, dst_data, &attr); + } +``` + +All kernels are included in `paddle/fluid/operators/jit/kernels.h`, which is automatically generated at compile time; you only need to include this one header to get all the registered kernels. ## Solid Test - Unit Test All functions should be compared with the corresponding reference functions, including the data types `float` and `double`. - Benchmark - All functions should be tested, and make sure the `jit::Get` function obtain the best performance with all attributes. + All functions should be tested, and make sure the `jit::GetDefaultBestFunc` function obtains the best performance with all attributes. # How to add new kernel ## Required 1. Add `your_key` at `KernelType`. -2. Add reference function of `your_key`. +2. Add your new `KernelTuple` which must include `your_key`. It should be a combination of the data type, attribute type and function type. You can refer to `SeqPoolTuple`. +3. Add the reference function of `your_key`. Note: - this should be run on CPU and must not depend on any third party. - Add `USE_JITKERNEL_REFER(your_key)` in `refer/CmakeLists.txt` to make sure this code can be used. -3. Add unit test in `test.cc`, and verfiy at least `float` and `double`. +4. Add a unit test in `test.cc`, and verify at least `float` and `double`. Test more data types for some special functions if necessary, for example `int8`. -4. Add functions in `benchmark.cc` to test all function of same `KernelType`. Make sure `jit::Get` always get the best one. +5. Add functions in `benchmark.cc` to test all functions of the same `KernelType`. Make sure `GetDefaultBestFunc` always gets the best one. ## Optional Add more implementations of `your_key` for performance enhancement. -1. Add functions based on generated code in `gen`. It should be derived from `JitCode` and should have corepsonding creator from `JitCodeCreator` which will be registered on the `your_key`. Note: Add new `KernelTuples` if necessary,your can refer to `XYZNTuples`. Specialie method `JitCodeKey` when add new attribute type。 -2. Add more functions in `more`,you can use any third party you wish, like mkl, mkldnn or intrinsic code to reach the best performance. +1. Add functions based on generated code in `gen`. It should be derived from `JitCode` and should have a corresponding creator from `JitCodeCreator` which will be registered on the `your_key`. +2. If a new attribute type is added, you should specialize `JitCodeKey` for this type. +3. Add more functions in `more`; you can use any third party you wish, like mkl, mkldnn or intrinsic code to reach the best performance.
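A further usage pattern worth noting is checking any candidate against the reference implementation. The sketch below assumes a `VMulTuple<T>` kernel tuple analogous to the `SeqPoolTuple` above, and the `GetDefaultBestFunc`/`GetReferFunc` entry points named earlier; treat the exact signatures as assumptions rather than the library's documented API.

```cpp
  // Hedged sketch: compare the default best kernel with the reference one.
  using T = float;
  const int d = 8;
  std::vector<T> x(d, 2.f), y(d, 3.f), z(d), ref(d);
  auto best = jit::GetDefaultBestFunc<jit::VMulTuple<T>, platform::CPUPlace>(d);
  auto refer = jit::GetReferFunc<jit::VMulTuple<T>>();  // CPU-only reference
  best(x.data(), y.data(), z.data(), d);
  refer(x.data(), y.data(), ref.data(), d);
  // z and ref should agree element-wise for every candidate implementation.
```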
diff --git a/paddle/fluid/operators/jit/README.md b/paddle/fluid/operators/jit/README.md index cc19f09f56ddf6a7c74d6605ab3f1bd059f19bb8..770548c5260f73f038f52e0b06b77ba698851997 100644 --- a/paddle/fluid/operators/jit/README.md +++ b/paddle/fluid/operators/jit/README.md @@ -1,7 +1,7 @@ # JIT Kernel Combine function templates and JIT to generate the needed kernel functions. -The kernels here are operator units at a smaller granularity than the kernels inside Operators, with more focus on performance across different hardware. There can be implementations based on multiple third-party libraries, and each implementation has its own `UseMe` function that decides under which conditions it can be called. +The kernels here are operator units at a smaller granularity than the kernels inside Operators, with more focus on performance across different hardware. There can be implementations based on multiple third-party libraries, and each implementation has its own `CanBeUsed` function that decides under which conditions it can be called. The functions implemented here can be very fine-grained methods such as vector MUL, or a complex piece of logic such as LSTM. Complex logic can also be stitched together from its own lower-level functions. Currently only high-performance computation on CPU is supported. @@ -39,27 +39,55 @@ PaddlePaddle/Paddle/paddle/fluid/ ## Getting kernels dynamically -A single `jit::Get` method is provided, which fetches by kernel type; every implementation has its own applicable range, and the needed kernel function is chosen dynamically from that range and the current conditions. +- `GetAllCandidateFuncs`: given the kernel type, fetches all function implementations that meet the requirements. All implementations guarantee the same result but differ in speed, so you can benchmark at runtime for the concrete input attributes and manually pick the best function. +- `GetDefaultBestFunc`: returns one default best function implementation. It is the result of offline tuning under some general configurations and attributes, and covers the best choice in most situations. +- `KernelFuncs::Cache()`: returns the default best function and caches the function pointer; when the same attribute appears again, the cached pointer is returned directly, otherwise a new entry is created for the attribute. +- `GetReferFunc`: returns the most primitive logic function of the kernel. It is independent of the kernel's input size and attributes, and there is one and only one implementation, on CPU. It embodies the kernel's original logic, and all other implementations keep their logic consistent with it. + +### Examples + +To call any kernel, you only need to include the header `"paddle/fluid/operators/jit/kernels.h"`, which is generated automatically at compile time. + +Get the default best function directly from the cache: + +```cpp + using T = float; + jit::seq_pool_attr_t attr(width, jit::SeqPoolType::kSum); + auto seqpool_func = jit::KernelFuncs, platform::CPUPlace>::Cache().At(attr); + seqpool_func(src_data, dst_data, &attr); +``` + +Run every implementation once and log its implementation type: + +```cpp + using T = float; + jit::seq_pool_attr_t attr(width, jit::SeqPoolType::kSum); + auto funcs = jit::GetAllCandidateFuncsWithTypes, platform::CPUPlace>(attr); + for (auto f : funcs) { + LOG(INFO) << "Kernel implementation type: " << f.first; + f.second(src_data, dst_data, &attr); + } +``` ## Testing - Logic tests: every implementation must be compared against the refer code and must meet the precision requirements, for both the float and double data types - Performance tests - Compare the performance of all implementations, and also against the final `jit::Get` method; the performance that method obtains must be the best under all conditions. + Compare the performance of all implementations, and also against the final `jit::GetDefaultBestFunc` method; the performance that method obtains must be the best under all conditions. # How to add a new operator -- Add `your_key` to `KernelType`. -- Implement the reference logic; it must be a CPU implementation and must not depend on any third-party library. After implementing it, add `USE_JITKERNEL_REFER(your_key)` in `refer/CmakeLists.txt` to enable the kernel. -- (optional) Implement more algorithms under the `more` directory; they may depend on third-party libraries such as mkl, intrinsics or mkldnn. -- (optional) Implement Xbyak-based generated code under the `gen` directory. The jitcode needs its own `JitCodeCreator`, registered on the same `KernelType` as the refer one. -- When necessary, new `KernelTuples` can be added; see `XYZNTuples`. A newly added Attr type needs a specialized `JitCodeKey` method. -- Add unit tests in `test.cc`, covering at least the `float` and `double` data types; support extra data types such as the `int8`-related functions when necessary. -- Add the corresponding performance comparison in `benchmark.cc`; all implementations of one kernel must be compared, making sure the implementation obtained by `jit::Get` is always the fastest. +1. Add `your_key` to `KernelType`. +2. Implement the reference logic; it must be a CPU implementation and must not depend on any third-party library. After implementing it, add `USE_JITKERNEL_REFER(your_key)` in `refer/CmakeLists.txt` to enable the kernel. +3. (optional) Implement more algorithms under the `more` directory; they may depend on third-party libraries such as mkl, intrinsics or mkldnn. +4. (optional) Implement Xbyak-based generated code under the `gen` directory. The jitcode needs its own `JitCodeCreator`, registered on the same `KernelType` as the refer one. +5. Add a new `KernelTuple` in one-to-one correspondence with `KernelType`; it bundles all the types together, including the data type, the attribute type, and the returned function type. See `SeqPoolTuple`; a newly added Attr type needs a specialized `JitCodeKey` method. +6. Add unit tests in `test.cc`, covering at least the `float` and `double` data types; support extra data types such as the `int8`-related functions when necessary. +7.
Add the corresponding performance comparison in `benchmark.cc`; all implementations of one kernel must be compared, making sure the implementation obtained by `GetDefaultBestFunc` is always the fastest. # Advantages -- A unified Get method and a simple interface. +- Convenient interfaces and flexible invocation. - One set of logic can have multiple implementations, relying on multiple third-party libraries without interfering with each other. - A clear directory structure, which avoids the poor readability caused by piling many macro definitions into one file. - Easy to optimize: a particular attribute can be targeted directly without affecting performance under other attributes. diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 3088280bb90174e6195a349c07a3435e131e2b33..fbb04a166ef52efd9bd05f27ca656d928d97fb96 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -59,8 +59,6 @@ BenchJITKernel* InsertBenchmark(BenchJITKernel* b) { InsertBenchmark(new BenchJITKernel_##name##_##dtype##_##place##_()); \ void BenchJITKernel_##name##_##dtype##_##place##_::Run() -#define BENCH_FP32_CPU(name) BENCH_JITKERNEL(name, FP32, CPU) - void RUN_ALL_BENCHMARK() { for (auto p : g_all_benchmarks) { if (!FLAGS_filter.empty() && FLAGS_filter != p->Name()) { @@ -90,11 +88,11 @@ std::vector TestSizes() { return s; } -template +template struct BenchFunc { // return this function avg time // TODO(TJ): clear cache every time - double operator()(const typename KernelTuples::func_type tgt, Args... args) { + double operator()(const typename KernelTuple::func_type tgt, Args... args) { for (int i = 0; i < FLAGS_burning; ++i) { tgt(args...); } @@ -109,40 +107,17 @@ struct BenchFunc { namespace jit = paddle::operators::jit; -template -void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { - BenchFunc benchmark; +template +void BenchAllImpls(const typename KernelTuple::attr_type& attr, Args... args) { + BenchFunc benchmark; std::vector> infos; - // test refer - auto refer = jit::GetRefer(); - if (!refer) { - LOG(FATAL) << "Refer can not be empty!"; + auto funcs = jit::GetAllCandidateFuncsWithTypes(attr); + for (auto f : funcs) { + infos.push_back(std::make_pair(f.first, benchmark(f.second, args...))); } - infos.push_back(std::make_pair("Refer", benchmark(refer, args...))); - // test jitcode - auto jitcode = jit::GetJitCode(attr); - if (jitcode) { - infos.push_back(std::make_pair("JitCode", benchmark(jitcode, args...))); - } - // test all impls in more - jit::KernelKey kkey(KT, PlaceType()); - auto& pool = jit::KernelPool().Instance().AllKernels(); - auto iter = pool.find(kkey); - if (iter != pool.end()) { - auto& impls = iter->second; - for (auto& impl : impls) { - auto i = dynamic_cast*>(impl.get()); - if (i && i->UseMe(attr)) { - auto more = i->GetFunc(); - infos.push_back( - std::make_pair(i->ImplType(), benchmark(more, args...))); - } - } - } // Test result from Get function - auto tgt = jit::Get(attr); + auto tgt = jit::KernelFuncs::Cache().At(attr); if (!tgt) { LOG(FATAL) << "Target can not be empty!"; } @@ -150,7 +125,8 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { // print std::ostringstream loginfos; - loginfos << "Kernel Type " << jit::to_string(KT) << ": " << attr << ": "; + loginfos << "Kernel Type " << jit::to_string(KernelTuple::kernel_type) << ": " + << attr << ": "; for (auto pair : infos) { loginfos << pair.first << " takes " << pair.second << " us; "; } @@ -159,8 +135,9 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args...
args) { using Tensor = paddle::framework::Tensor; -template -void BenchXYZNKernel() { +template +void BenchKernelXYZN() { + using T = typename KernelTuple::data_type; for (int d : TestSizes()) { Tensor x, y, z; x.Resize({d}); @@ -171,16 +148,16 @@ void BenchXYZNKernel() { T* z_data = z.mutable_data(PlaceType()); RandomVec(d, x_data); RandomVec(d, y_data); - BenchAllImpls, PlaceType>(d, x.data(), - y.data(), z_data, d); + BenchAllImpls(d, x.data(), y.data(), z_data, + d); // test inplace - BenchAllImpls, PlaceType>(d, x.data(), z_data, - z_data, d); + BenchAllImpls(d, x.data(), z_data, z_data, d); } } -template -void BenchAXYNKernel() { +template +void BenchKernelAXYN() { + using T = typename KernelTuple::data_type; for (int d : TestSizes()) { const T a = static_cast(3); Tensor x, y; @@ -189,26 +166,26 @@ void BenchAXYNKernel() { T* x_data = x.mutable_data(PlaceType()); T* y_data = y.mutable_data(PlaceType()); RandomVec(d, x_data); - BenchAllImpls, PlaceType>(d, &a, x.data(), y_data, - d); + BenchAllImpls(d, &a, x.data(), y_data, d); // test inplace - BenchAllImpls, PlaceType>(d, &a, x.data(), x_data, - d); + BenchAllImpls(d, &a, x.data(), x_data, d); } } -template -void BenchXRNKernel() { +template +void BenchKernelXRN() { + using T = typename KernelTuple::data_type; for (int d : TestSizes()) { Tensor x; RandomVec(d, x.mutable_data({d}, PlaceType())); T res; - BenchAllImpls, PlaceType>(d, x.data(), &res, d); + BenchAllImpls(d, x.data(), &res, d); } } -template -void BenchXYNKernel() { +template +void BenchKernelXYN() { + using T = typename KernelTuple::data_type; for (int d : TestSizes()) { Tensor x, y; x.Resize({d}); @@ -216,12 +193,13 @@ void BenchXYNKernel() { T* x_data = x.mutable_data(PlaceType()); T* y_data = y.mutable_data(PlaceType()); RandomVec(d, x_data); - BenchAllImpls, PlaceType>(d, x.data(), y_data, d); + BenchAllImpls(d, x.data(), y_data, d); } } -template -void BenchLSTMKernel() { +template +void BenchKernelLSTM() { + using T = typename KernelTuple::data_type; for (bool use_peephole : {true, false}) { for (int d : TestSizes()) { const jit::lstm_attr_t attr(d, jit::kVSigmoid, jit::kVTanh, jit::kVTanh, @@ -252,13 +230,14 @@ void BenchLSTMKernel() { step.wp = wp_data; step.checked = checked_data; } - BenchAllImpls, PlaceType>(attr, &step, &attr); + BenchAllImpls(attr, &step, &attr); } } } -template -void BenchGRUKernel() { +template +void BenchKernelGRU() { + using T = typename KernelTuple::data_type; for (int d : TestSizes()) { const jit::gru_attr_t attr(d, jit::kVSigmoid, jit::kVTanh); auto place = PlaceType(); @@ -275,12 +254,13 @@ void BenchGRUKernel() { step.gates = x_data; step.ht_1 = ht_1_data; step.ht = ht_data; - BenchAllImpls, PlaceType>(attr, &step, &attr); + BenchAllImpls(attr, &step, &attr); } } -template -void BenchSeqPoolKernel() { +template +void BenchKernelSeqPool() { + using T = typename KernelTuple::data_type; std::vector pool_types = { jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt}; for (auto type : pool_types) { @@ -294,15 +274,15 @@ void BenchSeqPoolKernel() { RandomVec(h * w, x.mutable_data(PlaceType()), -2.f, 2.f); const T* x_data = x.data(); T* y_data = y.mutable_data(PlaceType()); - BenchAllImpls, PlaceType>(attr, x_data, - y_data, &attr); + BenchAllImpls(attr, x_data, y_data, &attr); } } } } -template -void BenchEmbSeqPoolKernel() { +template +void BenchKernelEmbSeqPool() { + using T = typename KernelTuple::data_type; std::vector pool_types = {jit::SeqPoolType::kSum}; int64_t tbl_h = 1e4; for (int tbl_w : {10, 16, 256}) { @@ 
-324,16 +304,17 @@ void BenchEmbSeqPoolKernel() { tbl_h - 1); const int64_t* idx_data = idx.data(); T* o_data = out.mutable_data(PlaceType()); - BenchAllImpls, PlaceType>( - attr, table_data, idx_data, o_data, &attr); + BenchAllImpls(attr, table_data, idx_data, + o_data, &attr); } } } } } -template -void BenchSgdKernel() { +template +void BenchKernelSgd() { + using T = typename KernelTuple::data_type; const T lr = 0.1; auto UnDuplicatedRandomVec = [](int n, const int64_t lower, const int64_t upper) -> std::vector { @@ -364,15 +345,16 @@ void BenchSgdKernel() { const T* grad_data = grad.data(); const int64_t* rows_data = rows.data(); jit::sgd_attr_t attr(param_h, grad_w, rows_size, grad_w, rows_size); - BenchAllImpls, PlaceType>( - attr, &lr, param_data, grad_data, rows_data, param_data, &attr); + BenchAllImpls(attr, &lr, param_data, grad_data, + rows_data, param_data, &attr); } } } } -template -void BenchMatMulKernel() { +template +void BenchKernelMatMul() { + using T = typename KernelTuple::data_type; for (int m : {1, 2, 3, 4}) { for (int n : TestSizes()) { for (int k : TestSizes()) { @@ -386,15 +368,16 @@ void BenchMatMulKernel() { const T* b_data = b.data(); T* c_data = c.mutable_data(PlaceType()); const jit::matmul_attr_t attr{m, n, k}; - BenchAllImpls, PlaceType>(attr, a_data, b_data, - c_data, &attr); + BenchAllImpls(attr, a_data, b_data, c_data, + &attr); } } } } -template -void BenchSoftmaxKernel() { +template +void BenchKernelSoftmax() { + using T = typename KernelTuple::data_type; for (int bs : {1, 2, 10}) { for (int n : TestSizes()) { Tensor x, y; @@ -403,14 +386,14 @@ void BenchSoftmaxKernel() { RandomVec(bs * n, x.mutable_data(PlaceType()), -2.f, 2.f); const T* x_data = x.data(); T* y_data = y.mutable_data(PlaceType()); - BenchAllImpls, PlaceType>(n, x_data, y_data, n, - bs); + BenchAllImpls(n, x_data, y_data, n, bs); } } } -template -void BenchLayerNormKernel() { +template +void BenchKernelLayerNorm() { + using T = typename KernelTuple::data_type; const T epsilon = 9.99999975e-06; for (int n : {1, 2, 10}) { for (int x_dim_0 : {1, 9, 17, 50}) { @@ -439,16 +422,17 @@ void BenchLayerNormKernel() { T* var_data = var.data(); T* out_data = out.mutable_data(PlaceType()); - BenchAllImpls, PlaceType>( - right, x_data, out_data, mean_data, var_data, scale_data, bias_data, - left, epsilon, right); + BenchAllImpls(right, x_data, out_data, + mean_data, var_data, scale_data, + bias_data, left, epsilon, right); } } } } -template -void BenchCRFDecodingKernel() { +template +void BenchKernelCRFDecoding() { + using T = typename KernelTuple::data_type; constexpr int state_trans_base_idx = 2; for (int seq_len : {1, 11, 17, 50}) { for (int tag_num : TestSizes()) { @@ -468,14 +452,15 @@ void BenchCRFDecodingKernel() { T* alpha_data = alpha.mutable_data(PlaceType()); int* track_data = track.mutable_data(PlaceType()); - BenchAllImpls, PlaceType>( - tag_num, seq_len, x_data, w_data, alpha_data, track_data, tag_num); + BenchAllImpls(tag_num, seq_len, x_data, w_data, + alpha_data, track_data, tag_num); } } } -template -void BenchVBroadcastKernel() { +template +void BenchKernelVBroadcast() { + using T = typename KernelTuple::data_type; for (int64_t w : {1, 16, 64, 100, 256}) { Tensor x; x.Resize({w}); @@ -485,78 +470,86 @@ void BenchVBroadcastKernel() { Tensor y; y.Resize({h * w}); T* y_data = y.mutable_data(PlaceType()); - BenchAllImpls, PlaceType>( - w, x_data, y_data, static_cast(h), w); + BenchAllImpls(w, x_data, y_data, + static_cast(h), w); } } } -using T = float; -using CPUPlace = 
paddle::platform::CPUPlace;
+#define BenchKernelVMul BenchKernelXYZN
+#define BenchKernelVAdd BenchKernelXYZN
+#define BenchKernelVAddRelu BenchKernelXYZN
+#define BenchKernelVSub BenchKernelXYZN
-// xyzn
-BENCH_FP32_CPU(kVMul) { BenchXYZNKernel<jit::kVMul, T, CPUPlace>(); }
-BENCH_FP32_CPU(kVAdd) { BenchXYZNKernel<jit::kVAdd, T, CPUPlace>(); }
-BENCH_FP32_CPU(kVAddRelu) { BenchXYZNKernel<jit::kVAddRelu, T, CPUPlace>(); }
-BENCH_FP32_CPU(kVSub) { BenchXYZNKernel<jit::kVSub, T, CPUPlace>(); }
+#define BenchKernelVScal BenchKernelAXYN
+#define BenchKernelVAddBias BenchKernelAXYN
-// axyn
-BENCH_FP32_CPU(kVScal) { BenchAXYNKernel<jit::kVScal, T, CPUPlace>(); }
-BENCH_FP32_CPU(kVAddBias) { BenchAXYNKernel<jit::kVAddBias, T, CPUPlace>(); }
+#define BenchKernelVRelu BenchKernelXYN
+#define BenchKernelVIdentity BenchKernelXYN
+#define BenchKernelVSquare BenchKernelXYN
+#define BenchKernelVExp BenchKernelXYN
+#define BenchKernelVSigmoid BenchKernelXYN
+#define BenchKernelVTanh BenchKernelXYN
+#define BenchKernelVCopy BenchKernelXYN
-// xrn
-BENCH_FP32_CPU(kHSum) { BenchXRNKernel<jit::kHSum, T, CPUPlace>(); }
-BENCH_FP32_CPU(kHMax) { BenchXRNKernel<jit::kHMax, T, CPUPlace>(); }
+#define BenchKernelHMax BenchKernelXRN
+#define BenchKernelHSum BenchKernelXRN
-// xyn
-BENCH_FP32_CPU(kVRelu) { BenchXYNKernel<jit::kVRelu, T, CPUPlace>(); }
-BENCH_FP32_CPU(kVIdentity) { BenchXYNKernel<jit::kVIdentity, T, CPUPlace>(); }
-BENCH_FP32_CPU(kVSquare) { BenchXYNKernel<jit::kVSquare, T, CPUPlace>(); }
-BENCH_FP32_CPU(kVExp) { BenchXYNKernel<jit::kVExp, T, CPUPlace>(); }
-BENCH_FP32_CPU(kVSigmoid) { BenchXYNKernel<jit::kVSigmoid, T, CPUPlace>(); }
-BENCH_FP32_CPU(kVTanh) { BenchXYNKernel<jit::kVTanh, T, CPUPlace>(); }
-BENCH_FP32_CPU(kVCopy) { BenchXYNKernel<jit::kVCopy, T, CPUPlace>(); }
-
-// lstm and peephole
-BENCH_FP32_CPU(kLSTMCtHt) { BenchLSTMKernel<jit::kLSTMCtHt, T, CPUPlace>(); }
-BENCH_FP32_CPU(kLSTMC1H1) { BenchLSTMKernel<jit::kLSTMC1H1, T, CPUPlace>(); }
-
-// gru functions
-BENCH_FP32_CPU(kGRUH1) { BenchGRUKernel<jit::kGRUH1, T, CPUPlace>(); }
-BENCH_FP32_CPU(kGRUHtPart1) { BenchGRUKernel<jit::kGRUHtPart1, T, CPUPlace>(); }
-BENCH_FP32_CPU(kGRUHtPart2) { BenchGRUKernel<jit::kGRUHtPart2, T, CPUPlace>(); }
-
-// seq pool function
-BENCH_FP32_CPU(kSeqPool) { BenchSeqPoolKernel<jit::kSeqPool, T, CPUPlace>(); }
-
-// embedding seq pool function
-BENCH_FP32_CPU(kEmbSeqPool) {
-  BenchEmbSeqPoolKernel<jit::kEmbSeqPool, T, CPUPlace>();
-}
+#define BenchKernelLSTMCtHt BenchKernelLSTM
+#define BenchKernelLSTMC1H1 BenchKernelLSTM
-// sgd function
-BENCH_FP32_CPU(kSgd) { BenchSgdKernel<jit::kSgd, T, CPUPlace>(); }
+#define BenchKernelGRUH1 BenchKernelGRU
+#define BenchKernelGRUHtPart1 BenchKernelGRU
+#define BenchKernelGRUHtPart2 BenchKernelGRU
-// matmul
-BENCH_FP32_CPU(kMatMul) { BenchMatMulKernel<jit::kMatMul, T, CPUPlace>(); }
+using CPUPlace = paddle::platform::CPUPlace;
-// softmax
-BENCH_FP32_CPU(kSoftmax) { BenchSoftmaxKernel<jit::kSoftmax, T, CPUPlace>(); }
+#define BENCH_FP32_CPU(name)                                \
+  BENCH_JITKERNEL(name, FP32, CPU) {                        \
+    BenchKernel##name<jit::name##Tuple<float>, CPUPlace>(); \
+  }
-// layernorm
-BENCH_FP32_CPU(kLayerNorm) {
-  BenchLayerNormKernel<jit::kLayerNorm, T, CPUPlace>();
-}
+// xyzn
+BENCH_FP32_CPU(VMul);
+BENCH_FP32_CPU(VAdd);
+BENCH_FP32_CPU(VAddRelu);
+BENCH_FP32_CPU(VSub);
-// crfdecoding
-BENCH_FP32_CPU(kCRFDecoding) {
-  BenchCRFDecodingKernel<jit::kCRFDecoding, T, CPUPlace>();
-}
+// axyn
+BENCH_FP32_CPU(VScal);
+BENCH_FP32_CPU(VAddBias);
-// vbroadcast function
-BENCH_FP32_CPU(kVBroadcast) {
-  BenchVBroadcastKernel<jit::kVBroadcast, T, CPUPlace>();
-}
+// xyn
+BENCH_FP32_CPU(VRelu);
+BENCH_FP32_CPU(VIdentity);
+BENCH_FP32_CPU(VSquare);
+BENCH_FP32_CPU(VExp);
+BENCH_FP32_CPU(VSigmoid);
+BENCH_FP32_CPU(VTanh);
+BENCH_FP32_CPU(VCopy);
+
+// xrn
+BENCH_FP32_CPU(HMax);
+BENCH_FP32_CPU(HSum);
+
+// LSTM
+BENCH_FP32_CPU(LSTMCtHt);
+BENCH_FP32_CPU(LSTMC1H1);
+
+// GRU
+BENCH_FP32_CPU(GRUH1);
+BENCH_FP32_CPU(GRUHtPart1);
+BENCH_FP32_CPU(GRUHtPart2);
+
+BENCH_FP32_CPU(LayerNorm);
+BENCH_FP32_CPU(CRFDecoding);
+
+BENCH_FP32_CPU(SeqPool);
+BENCH_FP32_CPU(EmbSeqPool);
+BENCH_FP32_CPU(MatMul);
+BENCH_FP32_CPU(Softmax);
+BENCH_FP32_CPU(Sgd);
+BENCH_FP32_CPU(VBroadcast);
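A sketch of what one registration now expands to, assuming the macro reconstruction above; it is written out for `VMul` only, as illustration:

```cpp
// BENCH_FP32_CPU(VMul); becomes roughly:
BENCH_JITKERNEL(VMul, FP32, CPU) {
  // BenchKernelVMul is #defined to BenchKernelXYZN above
  BenchKernelVMul<jit::VMulTuple<float>, CPUPlace>();
}
```

so each registration line defines a benchmark body that times every candidate implementation of that kernel on FP32 CPU data.

// Benchmark all jit kernels including jitcode, mkl and refer.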
// To use this tool, run command: ./benchmark [options...] diff --git a/paddle/fluid/operators/jit/gen/act.cc b/paddle/fluid/operators/jit/gen/act.cc index e7a7375879064eb27c94315fe7b93eece7866b92..ad68e792c7a8ec4fb600a5b04153ad45895d761a 100644 --- a/paddle/fluid/operators/jit/gen/act.cc +++ b/paddle/fluid/operators/jit/gen/act.cc @@ -13,6 +13,7 @@ * limitations under the License. */ #include "paddle/fluid/operators/jit/gen/act.h" +#include #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" @@ -81,7 +82,7 @@ void VActJitCode::genCode() { #define DECLARE_ACT_CREATOR(name) \ class name##Creator : public JitCodeCreator { \ public: \ - bool UseMe(const int& attr) const override; \ + bool CanBeUsed(const int& attr) const override; \ size_t CodeSize(const int& d) const override; \ std::unique_ptr CreateJitCode(const int& attr) const override { \ return make_unique(attr, CodeSize(attr)); \ @@ -96,27 +97,27 @@ DECLARE_ACT_CREATOR(VSigmoid); DECLARE_ACT_CREATOR(VTanh); // TODO(TJ): tuning use me -bool VReluCreator::UseMe(const int& d) const { +bool VReluCreator::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx); } -bool VSquareCreator::UseMe(const int& d) const { +bool VSquareCreator::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx); } -bool VIdentityCreator::UseMe(const int& d) const { +bool VIdentityCreator::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx); } -bool VExpCreator::UseMe(const int& d) const { +bool VExpCreator::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx) && d < 32; } -bool VSigmoidCreator::UseMe(const int& d) const { +bool VSigmoidCreator::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx); } -bool VTanhCreator::UseMe(const int& d) const { +bool VTanhCreator::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx); } diff --git a/paddle/fluid/operators/jit/gen/blas.cc b/paddle/fluid/operators/jit/gen/blas.cc index 5da24c359edd2df93333fe0ca8a18cdc7385aadb..c126b9077ae50f528074210bae39227a9fcd3277 100644 --- a/paddle/fluid/operators/jit/gen/blas.cc +++ b/paddle/fluid/operators/jit/gen/blas.cc @@ -13,6 +13,7 @@ * limitations under the License. 
*/ #include "paddle/fluid/operators/jit/gen/blas.h" +#include #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" @@ -142,7 +143,7 @@ void NCHW16CMulNCJitCode::genCode() { class NCHW16CMulNCCreator : public JitCodeCreator { public: - bool UseMe(const int& attr) const override { + bool CanBeUsed(const int& attr) const override { return platform::MayIUse(platform::avx512f); } size_t CodeSize(const int& d) const override { return 256 * 1024; } @@ -154,7 +155,7 @@ class NCHW16CMulNCCreator : public JitCodeCreator { #define DECLARE_BLAS_CREATOR(name) \ class name##Creator : public JitCodeCreator { \ public: \ - bool UseMe(const int& attr) const override { \ + bool CanBeUsed(const int& attr) const override { \ return platform::MayIUse(platform::avx) && attr <= 1024; \ } \ size_t CodeSize(const int& d) const override { \ diff --git a/paddle/fluid/operators/jit/gen/embseqpool.cc b/paddle/fluid/operators/jit/gen/embseqpool.cc index 23837a3fb9886ae8a839d4b31bd57916168ea53c..331a4b0d0753b37843c3d112256abfbabe9a4913 100644 --- a/paddle/fluid/operators/jit/gen/embseqpool.cc +++ b/paddle/fluid/operators/jit/gen/embseqpool.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/operators/jit/gen/embseqpool.h" #include // offsetof +#include #include #include "paddle/fluid/operators/jit/gen/act.h" // for exp_float_consts ones #include "paddle/fluid/operators/jit/registry.h" @@ -121,7 +122,7 @@ void EmbSeqPoolJitCode::genCode() { class EmbSeqPoolCreator : public JitCodeCreator { public: - bool UseMe(const emb_seq_pool_attr_t& attr) const override { + bool CanBeUsed(const emb_seq_pool_attr_t& attr) const override { return platform::MayIUse(platform::avx) && attr.table_width % YMM_FLOAT_BLOCK == 0; } diff --git a/paddle/fluid/operators/jit/gen/gru.cc b/paddle/fluid/operators/jit/gen/gru.cc index 13f7a14111a80632a06c7fc632da47c0802828f7..b5b0cffa80612c61829766027013f172962b5069 100644 --- a/paddle/fluid/operators/jit/gen/gru.cc +++ b/paddle/fluid/operators/jit/gen/gru.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/operators/jit/gen/gru.h" #include // offsetof +#include #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" @@ -86,7 +87,7 @@ void GRUJitCode::genCode() { class name##Creator : public JitCodeCreator { \ public: \ /* TODO(TJ): enable more */ \ - bool UseMe(const gru_attr_t& attr) const override { \ + bool CanBeUsed(const gru_attr_t& attr) const override { \ return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \ } \ size_t CodeSize(const gru_attr_t& attr) const override { \ diff --git a/paddle/fluid/operators/jit/gen/hopv.cc b/paddle/fluid/operators/jit/gen/hopv.cc index e7884017198623d996fe98a55691da6e342d656a..462ac68a932e14b1200d503a937a35454c0e0618 100644 --- a/paddle/fluid/operators/jit/gen/hopv.cc +++ b/paddle/fluid/operators/jit/gen/hopv.cc @@ -13,6 +13,7 @@ * limitations under the License. 
*/ #include "paddle/fluid/operators/jit/gen/hopv.h" +#include #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" @@ -76,7 +77,7 @@ void HOPVJitCode::genCode() { #define DECLARE_HOP_CREATOR(name) \ class name##Creator : public JitCodeCreator { \ public: \ - bool UseMe(const int& attr) const override { \ + bool CanBeUsed(const int& attr) const override { \ return platform::MayIUse(platform::avx); \ } \ size_t CodeSize(const int& d) const override { \ diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h index 39847d1b65f771976c4dde5a3e34cc40e33851e6..228db7cc721099750da30adeaa828ae31f521422 100644 --- a/paddle/fluid/operators/jit/gen/jitcode.h +++ b/paddle/fluid/operators/jit/gen/jitcode.h @@ -73,7 +73,7 @@ class JitCode : public GenBase, public Xbyak::CodeGenerator { virtual void genCode() = 0; size_t getSize() const override { return CodeGenerator::getSize(); } - const unsigned char* getCodeInternal() override { + const unsigned char* getCodeInternal() const override { const Xbyak::uint8* code = CodeGenerator::getCode(); return code; } diff --git a/paddle/fluid/operators/jit/gen/lstm.cc b/paddle/fluid/operators/jit/gen/lstm.cc index 08bafb5a81882072129a4bfa86d5aff2d33a79a1..2c3bc985e9a8b224835d848d30e0a3ef641ed2f9 100644 --- a/paddle/fluid/operators/jit/gen/lstm.cc +++ b/paddle/fluid/operators/jit/gen/lstm.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/operators/jit/gen/lstm.h" #include // offsetof +#include #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" @@ -114,7 +115,7 @@ void LSTMJitCode::genCode() { class name##Creator : public JitCodeCreator { \ public: \ /* TODO(TJ): enable more */ \ - bool UseMe(const lstm_attr_t& attr) const override { \ + bool CanBeUsed(const lstm_attr_t& attr) const override { \ return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \ } \ size_t CodeSize(const lstm_attr_t& attr) const override { \ diff --git a/paddle/fluid/operators/jit/gen/matmul.cc b/paddle/fluid/operators/jit/gen/matmul.cc index ae3858eab20aeb80553d8fcec4088a6632c9c17d..d9955c8cc655f86bbc6c8135bdfa6c83493727f2 100644 --- a/paddle/fluid/operators/jit/gen/matmul.cc +++ b/paddle/fluid/operators/jit/gen/matmul.cc @@ -14,8 +14,8 @@ #include "paddle/fluid/operators/jit/gen/matmul.h" #include // offsetof +#include #include - #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" @@ -98,7 +98,7 @@ void MatMulJitCode::genCode() { class MatMulCreator : public JitCodeCreator { public: - bool UseMe(const matmul_attr_t& attr) const override { + bool CanBeUsed(const matmul_attr_t& attr) const override { return attr.m == 1 && platform::MayIUse(platform::avx512f) && attr.n % ZMM_FLOAT_BLOCK == 0 && attr.k < 512; } diff --git a/paddle/fluid/operators/jit/gen/seqpool.cc b/paddle/fluid/operators/jit/gen/seqpool.cc index 530d24ee1fb7d9da84102641e1d4d2ab08ab1860..d9e5904add4486ddf126093865f7e0571c1909e4 100644 --- a/paddle/fluid/operators/jit/gen/seqpool.cc +++ b/paddle/fluid/operators/jit/gen/seqpool.cc @@ -13,6 +13,7 @@ * limitations under the License. 
*/ #include "paddle/fluid/operators/jit/gen/seqpool.h" +#include #include "paddle/fluid/operators/jit/gen/act.h" // for exp_float_consts ones #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" @@ -57,7 +58,7 @@ void SeqPoolJitCode::genCode() { class SeqPoolCreator : public JitCodeCreator { public: - bool UseMe(const seq_pool_attr_t& attr) const override { + bool CanBeUsed(const seq_pool_attr_t& attr) const override { return platform::MayIUse(platform::avx); } size_t CodeSize(const seq_pool_attr_t& attr) const override { diff --git a/paddle/fluid/operators/jit/gen/sgd.cc b/paddle/fluid/operators/jit/gen/sgd.cc index a745a27f9543a75f6915c9316aad62fa41305bb1..e65d3500b496c811b2da39752417ce5ef3ab3914 100644 --- a/paddle/fluid/operators/jit/gen/sgd.cc +++ b/paddle/fluid/operators/jit/gen/sgd.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/operators/jit/gen/sgd.h" #include // offsetof +#include #include #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" @@ -104,7 +105,7 @@ void SgdJitCode::genCode() { class SgdCreator : public JitCodeCreator { public: - bool UseMe(const sgd_attr_t& attr) const override { + bool CanBeUsed(const sgd_attr_t& attr) const override { return platform::MayIUse(platform::avx) && attr.grad_width % YMM_FLOAT_BLOCK == 0; } diff --git a/paddle/fluid/operators/jit/gen/vbroadcast.cc b/paddle/fluid/operators/jit/gen/vbroadcast.cc index 3f9fbdbd821acae0940c5a7b8d9a5eb2432712ff..66a8d75fd4de5bae3ba37cf7fe7b1645938aa855 100644 --- a/paddle/fluid/operators/jit/gen/vbroadcast.cc +++ b/paddle/fluid/operators/jit/gen/vbroadcast.cc @@ -69,7 +69,7 @@ void VBroadcastJitCode::genCode() { class VBroadcastCreator : public JitCodeCreator { public: - bool UseMe(const int64_t& w) const override { + bool CanBeUsed(const int64_t& w) const override { return platform::MayIUse(platform::avx) && w % YMM_FLOAT_BLOCK == 0; } size_t CodeSize(const int64_t& w) const override { diff --git a/paddle/fluid/operators/jit/gen_base.cc b/paddle/fluid/operators/jit/gen_base.cc index f3603875ad7bda1fc688f9c053e0d37f7bb31f02..4c49eff49e3efc0664a084f9fa2bb897db0c6f1d 100644 --- a/paddle/fluid/operators/jit/gen_base.cc +++ b/paddle/fluid/operators/jit/gen_base.cc @@ -31,7 +31,7 @@ namespace paddle { namespace operators { namespace jit { -// refer do not need useme, it would be the last one. +// refer do not need CanBeUsed, it would be the last one. void GenBase::dumpCode(const unsigned char* code) const { if (code) { static int counter = 0; diff --git a/paddle/fluid/operators/jit/gen_base.h b/paddle/fluid/operators/jit/gen_base.h index a7c7a35a7ea35bd80333b04f001d4ab5b5d1e06b..033c603c07c288ba621ceaa912ea0c476fe86cd6 100644 --- a/paddle/fluid/operators/jit/gen_base.h +++ b/paddle/fluid/operators/jit/gen_base.h @@ -31,9 +31,10 @@ class GenBase : public Kernel { virtual ~GenBase() = default; virtual std::string name() const = 0; virtual size_t getSize() const = 0; - virtual const unsigned char* getCodeInternal() = 0; + virtual const unsigned char* getCodeInternal() const = 0; + const char* ImplType() const override { return "JitCode"; } template - Func getCode() { + Func getCode() const { const unsigned char* code = this->getCodeInternal(); if (FLAGS_dump_jitcode) { this->dumpCode(code); @@ -65,7 +66,7 @@ class JitCodeCreator : public GenCreator { virtual ~JitCodeCreator() = default; // condition when this jit code can be used. 
- virtual bool UseMe(const Attr& attr) const = 0; + virtual bool CanBeUsed(const Attr& attr) const = 0; // estimate this code size virtual size_t CodeSize(const Attr& attr) const = 0; diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index d85c719c1c58c88ec244f1f6ad8343d66391241d..1ac5318d461c2e8bc4f43569602a88f95a76befb 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -16,6 +16,8 @@ #include #include +#include +#include // for std::move #include #include "paddle/fluid/operators/jit/gen_base.h" #include "paddle/fluid/operators/jit/kernel_base.h" @@ -27,35 +29,34 @@ namespace paddle { namespace operators { namespace jit { -template +template inline typename std::enable_if< - std::is_same::value && + std::is_same::value && std::is_same::value, - typename KernelTuples::func_type>::type -GetJitCode(const typename KernelTuples::attr_type& attr) { - using Func = typename KernelTuples::func_type; - using Attr = typename KernelTuples::attr_type; - size_t key = JitCodeKey(attr); - auto& codes = JitCodePool().Instance(); + const Kernel*>::type +GetJitCode(const typename KernelTuple::attr_type& attr) { + using Attr = typename KernelTuple::attr_type; + int64_t key = JitCodeKey(attr); + auto& codes = JitCodePool::Instance(); if (codes.Has(key)) { - return codes.AllKernels().at(key)->template getCode(); + return codes.AllKernels().at(key).get(); } // creator is not related with attr, so can use KernelKey as key - KernelKey kkey(KT, PlaceType()); + KernelKey kkey(KernelTuple::kernel_type, PlaceType()); // pool: (KernelKey(type, place), vector) - auto& creator_map = JitCodeCreatorPool().Instance().AllCreators(); + auto& creator_map = JitCodeCreatorPool::Instance().AllCreators(); auto iter = creator_map.find(kkey); if (iter != creator_map.end()) { auto& creators = iter->second; for (auto& cur : creators) { auto i = dynamic_cast*>(cur.get()); - if (i && i->UseMe(attr)) { + if (i && i->CanBeUsed(attr)) { auto p = i->CreateJitCode(attr); if (p) { - auto f = p->template getCode(); + auto res = p.get(); codes.Insert(key, std::move(p)); - return f; + return res; } } } @@ -63,87 +64,153 @@ GetJitCode(const typename KernelTuples::attr_type& attr) { return nullptr; } -template +template inline typename std::enable_if< - !std::is_same::value || + !std::is_same::value || !std::is_same::value, - typename KernelTuples::func_type>::type -GetJitCode(const typename KernelTuples::attr_type& attr) { + const Kernel*>::type +GetJitCode(const typename KernelTuple::attr_type& attr) { return nullptr; } // Refer code do not related with attr, which is just for cast // Refer is always on CPUPlace -template -inline typename KernelTuples::func_type GetRefer() { - auto& ref_pool = ReferKernelPool().Instance().AllKernels(); - KernelKey kkey(KT, platform::CPUPlace()); +template +inline const Kernel* GetReferKernel() { + auto& ref_pool = ReferKernelPool::Instance().AllKernels(); + KernelKey kkey(KernelTuple::kernel_type, platform::CPUPlace()); auto ref_iter = ref_pool.find(kkey); PADDLE_ENFORCE(ref_iter != ref_pool.end(), "Every Kernel should have reference function."); auto& ref_impls = ref_iter->second; for (auto& impl : ref_impls) { - auto i = dynamic_cast*>(impl.get()); + auto i = dynamic_cast*>(impl.get()); if (i) { - return i->GetFunc(); + return i; } } return nullptr; } -template -typename KernelTuples::func_type Get( - const typename KernelTuples::attr_type& attr) { - auto jitfunc = GetJitCode(attr); - if (jitfunc) { - return jitfunc; +template 
+inline typename KernelTuple::func_type GetReferFunc() {
+  auto ker = GetReferKernel<KernelTuple>();
+  auto p = dynamic_cast<const ReferKernel<KernelTuple>*>(ker);
+  PADDLE_ENFORCE(p, "The Refer kernel should exist");
+  return p->GetFunc();
+}
+
+// Return all Kernels that can be used
+template <typename KernelTuple, typename PlaceType>
+std::vector<const Kernel*> GetAllCandidateKernels(
+    const typename KernelTuple::attr_type& attr) {
+  // the search order should be jitcode > more > refer
+  std::vector<const Kernel*> res;
+  auto jitker = GetJitCode<KernelTuple, PlaceType>(attr);
+  if (jitker) {
+    res.emplace_back(jitker);
  }
-  // pool: (KernelKey(type, place), vector<KernelPtr>)
-  KernelKey kkey(KT, PlaceType());
-  auto& pool = KernelPool().Instance().AllKernels();
+  // more kernelpool: (KernelKey(type, place), vector<KernelPtr>)
+  KernelKey kkey(KernelTuple::kernel_type, PlaceType());
+  auto& pool = KernelPool::Instance().AllKernels();
  auto iter = pool.find(kkey);
  if (iter != pool.end()) {
    auto& impls = iter->second;
    for (auto& impl : impls) {
-      auto i = dynamic_cast<const KernelMore<KernelTuples>*>(impl.get());
-      if (i && i->UseMe(attr)) {
-        return i->GetFunc();
+      auto i = dynamic_cast<const KernelMore<KernelTuple>*>(impl.get());
+      if (i && i->CanBeUsed(attr)) {
+        res.emplace_back(i);
      }
    }
  }
  // The last implementation should be reference function on CPUPlace.
-  return GetRefer<KT, KernelTuples>();
+  auto ref = GetReferKernel<KernelTuple>();
+  PADDLE_ENFORCE(ref != nullptr, "Refer Kernel can not be empty.");
+  res.emplace_back(ref);
+  return res;
+}
+
+template <typename KernelTuple, typename PlaceType>
+std::vector<std::pair<std::string, typename KernelTuple::func_type>>
+GetAllCandidateFuncsWithTypes(const typename KernelTuple::attr_type& attr) {
+  using Func = typename KernelTuple::func_type;
+  auto kers = GetAllCandidateKernels<KernelTuple, PlaceType>(attr);
+  std::vector<std::pair<std::string, Func>> res;
+  for (auto k : kers) {
+    std::string name = k->ImplType();
+    if (name == "JitCode") {
+      auto i = dynamic_cast<const GenBase*>(k);
+      PADDLE_ENFORCE(i, "jitcode kernel cast should not fail.");
+      res.emplace_back(std::make_pair(name, i->template getCode<Func>()));
+    } else {
+      auto i = dynamic_cast<const KernelMore<KernelTuple>*>(k);
+      PADDLE_ENFORCE(i, "kernel cast should not fail.");
+      res.emplace_back(std::make_pair(name, i->GetFunc()));
+    }
+  }
+  return res;
+}
+
+template <typename KernelTuple, typename PlaceType>
+std::vector<typename KernelTuple::func_type> GetAllCandidateFuncs(
+    const typename KernelTuple::attr_type& attr) {
+  auto funcs = GetAllCandidateFuncsWithTypes<KernelTuple, PlaceType>(attr);
+  std::vector<typename KernelTuple::func_type> res;
+  for (auto& i : funcs) {
+    res.emplace_back(i.second);
+  }
+  return res;
+}
+
+template <typename KernelTuple, typename PlaceType>
+typename KernelTuple::func_type GetDefaultBestFunc(
+    const typename KernelTuple::attr_type& attr) {
+  auto funcs = GetAllCandidateFuncs<KernelTuple, PlaceType>(attr);
+  PADDLE_ENFORCE_GE(funcs.size(), 1UL);
+  // A runtime benchmark on this attr could pick the truly best one here.
+  // For now, just take the first candidate as the default best,
+  // since candidates are searched in order and tuned offline.
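+  // Illustrative note (not part of this patch): a caller that needs true
+  // runtime selection can instead time each candidate from
+  // GetAllCandidateFuncs<KernelTuple, PlaceType>(attr) on its own data,
+  // as benchmark.cc does, and keep the fastest one.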
+ return funcs[0]; } -template +template class KernelFuncs { public: KernelFuncs() = default; static KernelFuncs& Cache() { - static thread_local KernelFuncs g_func_cache; + static thread_local KernelFuncs g_func_cache; return g_func_cache; } - bool Has(int key) const { return funcs_.find(key) != funcs_.end(); } - - void Insert(int key, typename KernelTuples::func_type func) { - funcs_.emplace(key, func); - } - - typename KernelTuples::func_type At(int key) { + // the exposed interface to use + typename KernelTuple::func_type At( + const typename KernelTuple::attr_type& attr) { + // Maybe here is not good enough, not all kernels should have jitcode + int64_t key = JitCodeKey(attr); if (Has(key)) { return funcs_.at(key); } - auto func = Get(key); + // If do not have this attr in cache then get the default best + auto func = GetDefaultBestFunc(attr); Insert(key, func); return func; } + typename KernelTuple::func_type operator[]( + const typename KernelTuple::attr_type& attr) { + return At(attr); + } + + protected: + bool Has(int64_t key) const { return funcs_.find(key) != funcs_.end(); } + void Insert(int64_t key, typename KernelTuple::func_type func) { + funcs_.emplace(key, func); + } + private: - std::unordered_map funcs_; + std::unordered_map funcs_; DISABLE_COPY_AND_ASSIGN(KernelFuncs); }; diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index 96e162a21bff2a5624f35ada615c9a9a17ad3c75..bd34d7dfc72a139e70983c56c3220bd01d572bcd 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -62,26 +62,55 @@ typedef enum { kSqrt, } SeqPoolType; +// x, y, z, n template -struct XYZNTuples { +struct XYZNTuple { typedef T data_type; typedef int attr_type; typedef void (*func_type)(const T*, const T*, T*, int); }; +// a, x, y, n template -struct AXYNTuples : public XYZNTuples {}; +struct AXYNTuple : public XYZNTuple {}; +// x, y, n template -struct XYNTuples { +struct XYNTuple { typedef T data_type; typedef int attr_type; typedef void (*func_type)(const T*, T*, int); }; -// x, return and int +// x, returned value, n template -struct XRNTuples : public XYNTuples {}; +struct XRNTuple : public XYNTuple {}; + +#define DECLARE_KERNELTUPLE(kernel_tuple, type) \ + template \ + struct type##Tuple : public kernel_tuple { \ + static constexpr KernelType kernel_type = k##type; \ + } + +// Tuple should be corresponding to the KernelType +DECLARE_KERNELTUPLE(XYZNTuple, VMul); +DECLARE_KERNELTUPLE(XYZNTuple, VAdd); +DECLARE_KERNELTUPLE(XYZNTuple, VAddRelu); +DECLARE_KERNELTUPLE(XYZNTuple, VSub); + +DECLARE_KERNELTUPLE(AXYNTuple, VScal); +DECLARE_KERNELTUPLE(AXYNTuple, VAddBias); + +DECLARE_KERNELTUPLE(XYNTuple, VRelu); +DECLARE_KERNELTUPLE(XYNTuple, VIdentity); +DECLARE_KERNELTUPLE(XYNTuple, VSquare); +DECLARE_KERNELTUPLE(XYNTuple, VExp); +DECLARE_KERNELTUPLE(XYNTuple, VSigmoid); +DECLARE_KERNELTUPLE(XYNTuple, VTanh); +DECLARE_KERNELTUPLE(XYNTuple, VCopy); + +DECLARE_KERNELTUPLE(XRNTuple, HMax); +DECLARE_KERNELTUPLE(XRNTuple, HSum); typedef struct { void* gates; // gates: x_ch, x_ih, x_fh, x_oh @@ -122,21 +151,31 @@ typedef struct rnn_attr_s gru_attr_t; typedef struct lstm_attr_s lstm_attr_t; template -struct LSTMTuples { +struct LSTMTuple { typedef T data_type; typedef lstm_attr_t attr_type; typedef void (*func_type)(lstm_t*, const lstm_attr_t*); }; template -struct GRUTuples { +struct GRUTuple { typedef T data_type; typedef gru_attr_t attr_type; typedef void (*func_type)(gru_t*, const gru_attr_t*); }; 
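A sketch of what `DECLARE_KERNELTUPLE(LSTMTuple, LSTMCtHt)` below expands to, assuming the template parameter elided in the macro definition above is `typename T`:

```cpp
template <typename T>
struct LSTMCtHtTuple : public LSTMTuple<T> {
  static constexpr KernelType kernel_type = kLSTMCtHt;
};
```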
+DECLARE_KERNELTUPLE(LSTMTuple, LSTMCtHt); +DECLARE_KERNELTUPLE(LSTMTuple, LSTMC1H1); + +DECLARE_KERNELTUPLE(GRUTuple, GRUH1); +DECLARE_KERNELTUPLE(GRUTuple, GRUHtPart1); +DECLARE_KERNELTUPLE(GRUTuple, GRUHtPart2); + +#undef DECLARE_KERNELTUPLE + template -struct VBroadcastTuples { +struct VBroadcastTuple { + static constexpr KernelType kernel_type = kVBroadcast; typedef T data_type; typedef int64_t attr_type; typedef void (*func_type)(const T*, T*, int64_t, int64_t); @@ -151,7 +190,8 @@ typedef struct seq_pool_attr_s { } seq_pool_attr_t; template -struct SeqPoolTuples { +struct SeqPoolTuple { + static constexpr KernelType kernel_type = kSeqPool; typedef T data_type; typedef seq_pool_attr_t attr_type; typedef void (*func_type)(const T*, T*, const seq_pool_attr_t*); @@ -176,7 +216,8 @@ typedef struct emb_seq_pool_attr_s { } emb_seq_pool_attr_t; template -struct EmbSeqPoolTuples { +struct EmbSeqPoolTuple { + static constexpr KernelType kernel_type = kEmbSeqPool; typedef T data_type; typedef emb_seq_pool_attr_t attr_type; typedef void (*func_type)(const T*, const int64_t*, T*, @@ -198,7 +239,8 @@ typedef struct sgd_attr_s { } sgd_attr_t; template -struct SgdTuples { +struct SgdTuple { + static constexpr KernelType kernel_type = kSgd; typedef T data_type; typedef sgd_attr_t attr_type; typedef void (*func_type)(const T*, const T*, const T*, const int64_t*, T*, @@ -214,21 +256,24 @@ typedef struct matmul_attr_s { } matmul_attr_t; template -struct MatMulTuples { +struct MatMulTuple { + static constexpr KernelType kernel_type = kMatMul; typedef T data_type; typedef matmul_attr_t attr_type; typedef void (*func_type)(const T*, const T*, T*, const matmul_attr_t*); }; template -struct CRFDecodingTuples { +struct CRFDecodingTuple { + static constexpr KernelType kernel_type = kCRFDecoding; typedef T data_type; typedef int attr_type; typedef void (*func_type)(const int, const T*, const T*, T*, int*, int); }; template -struct LayerNormTuples { +struct LayerNormTuple { + static constexpr KernelType kernel_type = kLayerNorm; typedef T data_type; typedef int attr_type; typedef void (*func_type)(T*, T*, T*, T*, const T*, const T*, int, @@ -236,7 +281,8 @@ struct LayerNormTuples { }; template -struct SoftmaxTuples { +struct SoftmaxTuple { + static constexpr KernelType kernel_type = kSoftmax; typedef T data_type; typedef int attr_type; typedef void (*func_type)(const T*, T*, int, int); @@ -244,7 +290,8 @@ struct SoftmaxTuples { // nChw16c = nChw16c .* NC template -struct NCHW16CMulNCTuples { +struct NCHW16CMulNCTuple { + static constexpr KernelType kernel_type = kNCHW16CMulNC; typedef T data_type; typedef int attr_type; typedef void (*func_type)(const T*, const T*, T*, int, int); @@ -255,28 +302,29 @@ class Kernel { public: Kernel() = default; virtual ~Kernel() = default; + virtual const char* ImplType() const = 0; DISABLE_COPY_AND_ASSIGN(Kernel); }; -template +template class KernelMore : public Kernel { public: - using T = typename KernelTuples::data_type; - using Func = typename KernelTuples::func_type; - using Attr = typename KernelTuples::attr_type; + using T = typename KernelTuple::data_type; + using Func = typename KernelTuple::func_type; + using Attr = typename KernelTuple::attr_type; virtual Func GetFunc() const { return func; } - virtual bool UseMe(const Attr& attr) const = 0; - virtual const char* ImplType() const = 0; + // specify this kernel can be used, means it should not fail if use it. 
+ virtual bool CanBeUsed(const Attr& attr) const = 0; protected: Func func{nullptr}; }; -template -class ReferKernel : public KernelMore { +template +class ReferKernel : public KernelMore { public: // Refer code can always be used - bool UseMe(const typename KernelTuples::attr_type& attr) const override { + bool CanBeUsed(const typename KernelTuple::attr_type& attr) const override { return true; } const char* ImplType() const override { return "Refer"; } diff --git a/paddle/fluid/operators/jit/kernel_key.cc b/paddle/fluid/operators/jit/kernel_key.cc index 1c2fddcae79d8b89e1169d5bcb364b3ff2e42dd3..1ad220b3972a3d3920610ab8f7ea416892a80d22 100644 --- a/paddle/fluid/operators/jit/kernel_key.cc +++ b/paddle/fluid/operators/jit/kernel_key.cc @@ -13,6 +13,7 @@ * limitations under the License. */ #include "paddle/fluid/operators/jit/kernel_key.h" +#include // XXH64: 13.8 GB/s #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -20,71 +21,46 @@ namespace operators { namespace jit { template <> -size_t JitCodeKey(const int& d) { +int64_t JitCodeKey(const int& d) { return d; } template <> -size_t JitCodeKey(const int64_t& d) { +int64_t JitCodeKey(const int64_t& d) { return d; } -// TODO(TJ): refine and benchmark JitCodeKey generatation -constexpr int act_type_shift = 3; // suppot 2^3 act types -static inline int act_type_convert(KernelType type) { - if (type == kVIdentity) { - return 0; - } else if (type == kVExp) { - return 1; - } else if (type == kVRelu) { - return 2; - } else if (type == kVSigmoid) { - return 3; - } else if (type == kVTanh) { - return 4; - } - PADDLE_THROW("Unsupported act type %d", type); - return 0; -} - template <> -size_t JitCodeKey(const lstm_attr_t& attr) { - size_t key = attr.d; - int gate_key = act_type_convert(attr.act_gate) << 1; - int cand_key = act_type_convert(attr.act_cand) << (1 + act_type_shift); - int cell_key = act_type_convert(attr.act_cell) << (1 + act_type_shift * 2); - return (key << (1 + act_type_shift * 3)) + gate_key + cand_key + cell_key + - attr.use_peephole; +int64_t JitCodeKey(const gru_attr_t& attr) { + return XXH64(&attr, sizeof(gru_attr_t), 0); } template <> -size_t JitCodeKey(const gru_attr_t& attr) { - size_t key = attr.d; - return (key << (act_type_shift * 2)) + act_type_convert(attr.act_gate) + - (act_type_convert(attr.act_cand) << act_type_shift); +int64_t JitCodeKey(const lstm_attr_t& attr) { + int keys[5] = { + attr.d, static_cast(attr.act_gate), static_cast(attr.act_cand), + static_cast(attr.act_cell), static_cast(attr.use_peephole)}; + return XXH64(keys, sizeof(int) * 5, 0); } template <> -size_t JitCodeKey(const seq_pool_attr_t& attr) { - size_t key = attr.w; - constexpr int pool_type_shift = 3; - return (key << pool_type_shift) + static_cast(attr.type); +int64_t JitCodeKey(const seq_pool_attr_t& attr) { + int keys[2] = {attr.w, static_cast(attr.type)}; + return XXH64(keys, sizeof(int) * 2, 0); } template <> -size_t JitCodeKey(const matmul_attr_t& attr) { - size_t key = attr.m; - constexpr int shift = 21; - return (key << shift * 2) + ((static_cast(attr.n)) << shift) + attr.k; +int64_t JitCodeKey(const matmul_attr_t& attr) { + return XXH64(&attr, sizeof(int) * 3, 0); // m, n, k } template <> -size_t JitCodeKey(const emb_seq_pool_attr_t& attr) { +int64_t JitCodeKey(const emb_seq_pool_attr_t& attr) { return attr.table_width; } template <> -size_t JitCodeKey(const sgd_attr_t& attr) { +int64_t JitCodeKey(const sgd_attr_t& attr) { return attr.grad_width; } diff --git a/paddle/fluid/operators/jit/kernel_key.h 
b/paddle/fluid/operators/jit/kernel_key.h index 611a0210d614196ad0b05d583303688c1d964e04..b2cf92f23e8ccff5fff7c6e193f7118fbb4765f0 100644 --- a/paddle/fluid/operators/jit/kernel_key.h +++ b/paddle/fluid/operators/jit/kernel_key.h @@ -46,7 +46,7 @@ struct KernelKey { // Every JitCode should have a method to get the key from attribution template -size_t JitCodeKey(const Attr& attr); +int64_t JitCodeKey(const Attr& attr); } // namespace jit } // namespace operators diff --git a/paddle/fluid/operators/jit/kernel_pool.h b/paddle/fluid/operators/jit/kernel_pool.h index 3e15242af28839ee0759e1a5b3930d6d6bfaa0ff..04710a54ac9ddf2ecb8f6a1f2ca33ef158d2d73f 100644 --- a/paddle/fluid/operators/jit/kernel_pool.h +++ b/paddle/fluid/operators/jit/kernel_pool.h @@ -17,6 +17,7 @@ #include // for unique_ptr #include #include +#include // for move #include #include "paddle/fluid/operators/jit/gen_base.h" #include "paddle/fluid/operators/jit/kernel_base.h" @@ -30,7 +31,7 @@ namespace jit { template class JitCodePool { typedef std::unique_ptr GenBasePtr; - typedef std::unordered_map JitCodeMap; + typedef std::unordered_map JitCodeMap; public: JitCodePool() = default; @@ -41,9 +42,9 @@ class JitCodePool { const JitCodeMap& AllKernels() { return codes_; } - bool Has(size_t key) const { return codes_.find(key) != codes_.end(); } + bool Has(int64_t key) const { return codes_.find(key) != codes_.end(); } - void Insert(size_t key, GenBasePtr value) { + void Insert(int64_t key, GenBasePtr value) { codes_.emplace(key, std::move(value)); } diff --git a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc index 16c91f8246dda34b1436fd4edd507e9ff603de6b..1254d00189a315b4f743f48e56b3eb53c92ec694 100644 --- a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc +++ b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc @@ -161,7 +161,7 @@ void CRFDecoding(const int seq_len, const float* x, const float* w, } } -bool CRFDecodingKernel::UseMe(const int& d) const { +bool CRFDecodingKernel::CanBeUsed(const int& d) const { #ifdef __AVX512F__ constexpr int block = ZMM_FLOAT_BLOCK; #else diff --git a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h index 24179d90ddcc6e7f44ffa4b2ca0886fbca5c81bf..49b1a1fea4b16f435120bb37c7d9c8c07a4cc4f5 100644 --- a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h +++ b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h @@ -26,11 +26,11 @@ namespace intrinsic { void CRFDecoding(const int seq_len, const float* x, const float* w, float* alpha, int* track, int tag_num); -class CRFDecodingKernel : public KernelMore> { +class CRFDecodingKernel : public KernelMore> { public: CRFDecodingKernel() { this->func = CRFDecoding; } - bool UseMe( - const typename CRFDecodingTuples::attr_type&) const override; + bool CanBeUsed( + const typename CRFDecodingTuple::attr_type&) const override; const char* ImplType() const override { return "Intrinsic"; } }; diff --git a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc index e9b6e401c6825b21191881d4e57fe09b48d2f4ee..a4e3246f10495b67871c08fd8cb7ccd1cf085c9e 100644 --- a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc +++ b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc @@ -153,7 +153,7 @@ void LayerNorm(float* x, float* out, float* mean, float* var, } } -bool LayerNormKernel::UseMe(const int& d) const { +bool 
LayerNormKernel::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx) && d >= YMM_FLOAT_BLOCK; } diff --git a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h index 89da2940f4420c418f9bd5260c4b74606cc9168f..7b9f676050d806314edd1e46611416a8b7170add 100644 --- a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h +++ b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h @@ -27,10 +27,11 @@ void LayerNorm(float* x, float* out, float* mean, float* var, const float* scale, const float* bias, int height, const float epsilon, int right); -class LayerNormKernel : public KernelMore> { +class LayerNormKernel : public KernelMore> { public: LayerNormKernel() { this->func = LayerNorm; } - bool UseMe(const typename LayerNormTuples::attr_type&) const override; + bool CanBeUsed( + const typename LayerNormTuple::attr_type&) const override; const char* ImplType() const override { return "Intrinsic"; } }; diff --git a/paddle/fluid/operators/jit/more/mix/mix.cc b/paddle/fluid/operators/jit/more/mix/mix.cc index 0036d1c238b17768c4df61af22a85588990e1815..6e709a16d232e2fa1a77e74e228b763fed4dd75b 100644 --- a/paddle/fluid/operators/jit/more/mix/mix.cc +++ b/paddle/fluid/operators/jit/more/mix/mix.cc @@ -23,6 +23,8 @@ namespace jit { namespace more { namespace mix { +using CPUPlace = platform::CPUPlace; + void VSigmoid(const T* x, T* y, int n) { const float min = SIGMOID_THRESHOLD_MIN; const float max = SIGMOID_THRESHOLD_MAX; @@ -30,7 +32,7 @@ void VSigmoid(const T* x, T* y, int n) { y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); y[i] = static_cast(0) - y[i]; } - auto compute = Get, platform::CPUPlace>(n); + auto compute = KernelFuncs, CPUPlace>::Cache().At(n); compute(y, y, n); for (int i = 0; i < n; ++i) { y[i] = static_cast(1) / (static_cast(1) + y[i]); @@ -39,9 +41,9 @@ void VSigmoid(const T* x, T* y, int n) { void VTanh(const T* x, T* y, int n) { const T a = 2, b = -1; - auto compute_scal = Get, platform::CPUPlace>(n); - auto compute_addbias = Get, platform::CPUPlace>(n); - auto compute_sigmoid = Get, platform::CPUPlace>(n); + auto compute_scal = KernelFuncs, CPUPlace>::Cache().At(n); + auto compute_addbias = KernelFuncs, CPUPlace>::Cache().At(n); + auto compute_sigmoid = KernelFuncs, CPUPlace>::Cache().At(n); compute_scal(&a, x, y, n); compute_sigmoid(y, y, n); compute_scal(&a, y, y, n); @@ -49,16 +51,12 @@ void VTanh(const T* x, T* y, int n) { } void Softmax(const T* x, T* y, int n, int bs) { - auto compute_hmax = - KernelFuncs, platform::CPUPlace>::Cache().At(n); - auto compute_hsum = - KernelFuncs, platform::CPUPlace>::Cache().At(n); - auto compute_vscal = - KernelFuncs, platform::CPUPlace>::Cache().At(n); + auto compute_hmax = KernelFuncs, CPUPlace>::Cache().At(n); + auto compute_hsum = KernelFuncs, CPUPlace>::Cache().At(n); + auto compute_vscal = KernelFuncs, CPUPlace>::Cache().At(n); auto compute_vaddbias = - KernelFuncs, platform::CPUPlace>::Cache().At(n); - auto compute_vexp = - KernelFuncs, platform::CPUPlace>::Cache().At(n); + KernelFuncs, CPUPlace>::Cache().At(n); + auto compute_vexp = KernelFuncs, CPUPlace>::Cache().At(n); for (int i = 0; i < bs; ++i) { T scalar; @@ -76,13 +74,13 @@ void Softmax(const T* x, T* y, int n, int bs) { void (*getActFunc(KernelType type, int d))(const T*, T*, int) { // NOLINT if (type == kVSigmoid) { - return Get, platform::CPUPlace>(d); + return KernelFuncs, CPUPlace>::Cache().At(d); } else if (type == kVRelu) { - return Get, platform::CPUPlace>(d); + return KernelFuncs, 
CPUPlace>::Cache().At(d); } else if (type == kVTanh) { - return Get, platform::CPUPlace>(d); + return KernelFuncs, CPUPlace>::Cache().At(d); } else if (type == kVIdentity) { - return Get, platform::CPUPlace>(d); + return KernelFuncs, CPUPlace>::Cache().At(d); } PADDLE_THROW("Not support type: %s", type); return nullptr; @@ -98,9 +96,9 @@ void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr) { const int d = attr->d; const int d2 = d * 2; const int d3 = d * 3; - auto vmul_d = Get, platform::CPUPlace>(d); - auto vadd_d = Get, platform::CPUPlace>(d); - auto vadd_d2 = Get, platform::CPUPlace>(d2); + auto vmul_d = KernelFuncs, CPUPlace>::Cache().At(d); + auto vadd_d = KernelFuncs, CPUPlace>::Cache().At(d); + auto vadd_d2 = KernelFuncs, CPUPlace>::Cache().At(d2); auto act_gate_d = getActFunc(attr->act_gate, d); auto act_gate_d2 = getActFunc(attr->act_gate, d2); auto act_gate_d3 = getActFunc(attr->act_gate, d3); @@ -140,8 +138,8 @@ void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr) { int d = attr->d; int d2 = d * 2; int d3 = d * 3; - auto vmul_d = Get, platform::CPUPlace>(d); - auto vadd_d = Get, platform::CPUPlace>(d); + auto vmul_d = KernelFuncs, CPUPlace>::Cache().At(d); + auto vadd_d = KernelFuncs, CPUPlace>::Cache().At(d); auto act_gate_d = getActFunc(attr->act_gate, d); auto act_cand_d = getActFunc(attr->act_cand, d); auto act_cell_d = getActFunc(attr->act_cell, d); @@ -169,7 +167,7 @@ void GRUH1(gru_t* step, const gru_attr_t* attr) { int d2 = d * 2; auto act_gate = getActFunc(attr->act_gate, d); auto act_cand = getActFunc(attr->act_cand, d); - auto vmul_d = Get, platform::CPUPlace>(d); + auto vmul_d = KernelFuncs, CPUPlace>::Cache().At(d); act_gate(gates, gates, d); act_cand(gates + d2, gates + d2, d); vmul_d(gates, gates + d2, ht, d); @@ -182,7 +180,7 @@ void GRUHtPart1(gru_t* step, const gru_attr_t* attr) { T* ht = reinterpret_cast(step->ht); const T* ht_1 = reinterpret_cast(step->ht_1); auto act_gate = getActFunc(attr->act_gate, attr->d); - auto vmul_d = Get, platform::CPUPlace>(attr->d); + auto vmul_d = KernelFuncs, CPUPlace>::Cache().At(attr->d); act_gate(gates + attr->d, gates + attr->d, attr->d); vmul_d(ht_1, gates + attr->d, ht, attr->d); } @@ -206,21 +204,21 @@ void GRUHtPart2(gru_t* step, const gru_attr_t* attr) { } // TODO(TJ): tuning me -bool VSigmoidKernel::UseMe(const int& d) const { return true; } +bool VSigmoidKernel::CanBeUsed(const int& d) const { return true; } -bool VTanhKernel::UseMe(const int& d) const { return true; } +bool VTanhKernel::CanBeUsed(const int& d) const { return true; } -bool SoftmaxKernel::UseMe(const int& d) const { return true; } +bool SoftmaxKernel::CanBeUsed(const int& d) const { return true; } -bool LSTMCtHtKernel::UseMe(const lstm_attr_t& attr) const { return true; } +bool LSTMCtHtKernel::CanBeUsed(const lstm_attr_t& attr) const { return true; } -bool LSTMC1H1Kernel::UseMe(const lstm_attr_t& attr) const { return true; } +bool LSTMC1H1Kernel::CanBeUsed(const lstm_attr_t& attr) const { return true; } -bool GRUH1Kernel::UseMe(const gru_attr_t& attr) const { return true; } +bool GRUH1Kernel::CanBeUsed(const gru_attr_t& attr) const { return true; } -bool GRUHtPart1Kernel::UseMe(const gru_attr_t& attr) const { return true; } +bool GRUHtPart1Kernel::CanBeUsed(const gru_attr_t& attr) const { return true; } -bool GRUHtPart2Kernel::UseMe(const gru_attr_t& attr) const { return true; } +bool GRUHtPart2Kernel::CanBeUsed(const gru_attr_t& attr) const { return true; } } // namespace mix } // namespace more @@ -230,16 +228,16 @@ bool GRUHtPart2Kernel::UseMe(const 
gru_attr_t& attr) const { return true; } namespace mix = paddle::operators::jit::more::mix; -#define REGISTER_MORE_KERNEL(key, func) \ - REGISTER_JITKERNEL_MORE(key, mix, mix::func##Kernel) - -REGISTER_MORE_KERNEL(kVSigmoid, VSigmoid); -REGISTER_MORE_KERNEL(kVTanh, VTanh); -REGISTER_MORE_KERNEL(kSoftmax, Softmax); -REGISTER_MORE_KERNEL(kLSTMCtHt, LSTMCtHt); -REGISTER_MORE_KERNEL(kLSTMC1H1, LSTMC1H1); -REGISTER_MORE_KERNEL(kGRUH1, GRUH1); -REGISTER_MORE_KERNEL(kGRUHtPart1, GRUHtPart1); -REGISTER_MORE_KERNEL(kGRUHtPart2, GRUHtPart2); +#define REGISTER_MORE_KERNEL(func) \ + REGISTER_JITKERNEL_MORE(k##func, mix, mix::func##Kernel) + +REGISTER_MORE_KERNEL(VSigmoid); +REGISTER_MORE_KERNEL(VTanh); +REGISTER_MORE_KERNEL(Softmax); +REGISTER_MORE_KERNEL(LSTMCtHt); +REGISTER_MORE_KERNEL(LSTMC1H1); +REGISTER_MORE_KERNEL(GRUH1); +REGISTER_MORE_KERNEL(GRUHtPart1); +REGISTER_MORE_KERNEL(GRUHtPart2); #undef REGISTER_MORE_KERNEL diff --git a/paddle/fluid/operators/jit/more/mix/mix.h b/paddle/fluid/operators/jit/more/mix/mix.h index d64af192197a0b339a39a1862c028875da2f3900..994d485909c874a8a15418ad946c79a10265c748 100644 --- a/paddle/fluid/operators/jit/more/mix/mix.h +++ b/paddle/fluid/operators/jit/more/mix/mix.h @@ -34,27 +34,27 @@ void GRUH1(gru_t* step, const gru_attr_t* attr); void GRUHtPart1(gru_t* step, const gru_attr_t* attr); void GRUHtPart2(gru_t* step, const gru_attr_t* attr); -#define DECLARE_MORE_KERNEL(name, tuples) \ - class name##Kernel : public KernelMore> { \ - public: \ - name##Kernel() { this->func = name; } \ - bool UseMe(const typename tuples::attr_type&) const override; \ - const char* ImplType() const override { return "Mixed"; } \ +#define DECLARE_MORE_KERNEL(name) \ + class name##Kernel : public KernelMore> { \ + public: \ + name##Kernel() { this->func = name; } \ + bool CanBeUsed(const typename name##Tuple::attr_type&) const override; \ + const char* ImplType() const override { return "Mixed"; } \ } // XYN -DECLARE_MORE_KERNEL(VSigmoid, XYNTuples); -DECLARE_MORE_KERNEL(VTanh, XYNTuples); +DECLARE_MORE_KERNEL(VSigmoid); +DECLARE_MORE_KERNEL(VTanh); // XRN -DECLARE_MORE_KERNEL(Softmax, SoftmaxTuples); +DECLARE_MORE_KERNEL(Softmax); -DECLARE_MORE_KERNEL(LSTMCtHt, LSTMTuples); -DECLARE_MORE_KERNEL(LSTMC1H1, LSTMTuples); +DECLARE_MORE_KERNEL(LSTMCtHt); +DECLARE_MORE_KERNEL(LSTMC1H1); -DECLARE_MORE_KERNEL(GRUH1, GRUTuples); -DECLARE_MORE_KERNEL(GRUHtPart1, GRUTuples); -DECLARE_MORE_KERNEL(GRUHtPart2, GRUTuples); +DECLARE_MORE_KERNEL(GRUH1); +DECLARE_MORE_KERNEL(GRUHtPart1); +DECLARE_MORE_KERNEL(GRUHtPart2); #undef DECLARE_MORE_KERNEL diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc index 4f51353bce834325e6c659399a374e4fbc40d4b7..4f600b38144f53798e3d4c66264fc5bfa671a4f7 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.cc +++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc @@ -130,105 +130,106 @@ void ASum(const double* x, double* res, int n) { // TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512 template <> -bool VMulKernel::UseMe(const int& d) const { +bool VMulKernel::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx512f) && d > 512; } template <> -bool VAddKernel::UseMe(const int& d) const { +bool VAddKernel::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx) && d > 512; } template <> -bool VScalKernel::UseMe(const int& d) const { +bool VScalKernel::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx512f) && d > 512; } template <> -bool VExpKernel::UseMe(const int& d) 
const { +bool VExpKernel::CanBeUsed(const int& d) const { return d > 7; } template <> -bool VSquareKernel::UseMe(const int& d) const { +bool VSquareKernel::CanBeUsed(const int& d) const { return d > 7; } template <> -bool VCopyKernel::UseMe(const int& d) const { +bool VCopyKernel::CanBeUsed(const int& d) const { return d > 15; } template <> -bool VBroadcastKernel::UseMe(const int64_t& d) const { +bool VBroadcastKernel::CanBeUsed(const int64_t& d) const { return d > 127; } template <> -bool VBroadcastKernel::UseMe(const int64_t& attr) const { +bool VBroadcastKernel::CanBeUsed(const int64_t& attr) const { return true; } template <> -bool VSigmoidKernel::UseMe(const int& d) const { +bool VSigmoidKernel::CanBeUsed(const int& d) const { return d > 7; } template <> -bool VTanhKernel::UseMe(const int& d) const { +bool VTanhKernel::CanBeUsed(const int& d) const { return d > 7; } template <> -bool SeqPoolKernel::UseMe(const seq_pool_attr_t& attr) const { +bool SeqPoolKernel::CanBeUsed(const seq_pool_attr_t& attr) const { return true; } template <> -bool SeqPoolKernel::UseMe(const seq_pool_attr_t& attr) const { +bool SeqPoolKernel::CanBeUsed(const seq_pool_attr_t& attr) const { return true; } template <> -bool EmbSeqPoolKernel::UseMe(const emb_seq_pool_attr_t& attr) const { +bool EmbSeqPoolKernel::CanBeUsed(const emb_seq_pool_attr_t& attr) const { return true; } template <> -bool EmbSeqPoolKernel::UseMe(const emb_seq_pool_attr_t& attr) const { +bool EmbSeqPoolKernel::CanBeUsed( + const emb_seq_pool_attr_t& attr) const { return true; } template <> -bool SgdKernel::UseMe(const sgd_attr_t& attr) const { +bool SgdKernel::CanBeUsed(const sgd_attr_t& attr) const { return true; } template <> -bool SgdKernel::UseMe(const sgd_attr_t& attr) const { +bool SgdKernel::CanBeUsed(const sgd_attr_t& attr) const { return true; } template <> -bool MatMulKernel::UseMe(const matmul_attr_t& attr) const { +bool MatMulKernel::CanBeUsed(const matmul_attr_t& attr) const { return platform::MayIUse(platform::avx); } template <> -bool MatMulKernel::UseMe(const matmul_attr_t& attr) const { +bool MatMulKernel::CanBeUsed(const matmul_attr_t& attr) const { return true; } template <> -bool SoftmaxKernel::UseMe(const int& d) const { +bool SoftmaxKernel::CanBeUsed(const int& d) const { // tuned on avx2 return platform::MayIUse(platform::avx) && d < 60; } -#define AWALYS_USE_ME_WITH_DOUBLE(func) \ - template <> \ - bool func##Kernel::UseMe(const int& d) const { \ - return true; \ +#define AWALYS_USE_ME_WITH_DOUBLE(func) \ + template <> \ + bool func##Kernel::CanBeUsed(const int& d) const { \ + return true; \ } AWALYS_USE_ME_WITH_DOUBLE(VMul); @@ -250,23 +251,23 @@ AWALYS_USE_ME_WITH_DOUBLE(Softmax); namespace mkl = paddle::operators::jit::more::mkl; -#define REGISTER_MKL_KERNEL(key, func) \ - REGISTER_JITKERNEL_MORE(key, mkl, mkl::func##Kernel, \ +#define REGISTER_MKL_KERNEL(func) \ + REGISTER_JITKERNEL_MORE(k##func, mkl, mkl::func##Kernel, \ mkl::func##Kernel) -REGISTER_MKL_KERNEL(kMatMul, MatMul); -REGISTER_MKL_KERNEL(kVMul, VMul); -REGISTER_MKL_KERNEL(kVAdd, VAdd); -REGISTER_MKL_KERNEL(kVScal, VScal); -REGISTER_MKL_KERNEL(kVExp, VExp); -REGISTER_MKL_KERNEL(kVSquare, VSquare); -REGISTER_MKL_KERNEL(kVCopy, VCopy); -REGISTER_MKL_KERNEL(kVBroadcast, VBroadcast); -REGISTER_MKL_KERNEL(kVSigmoid, VSigmoid); -REGISTER_MKL_KERNEL(kVTanh, VTanh); -REGISTER_MKL_KERNEL(kSeqPool, SeqPool); -REGISTER_MKL_KERNEL(kEmbSeqPool, EmbSeqPool); -REGISTER_MKL_KERNEL(kSoftmax, Softmax); -REGISTER_MKL_KERNEL(kSgd, Sgd); +REGISTER_MKL_KERNEL(MatMul); 
+REGISTER_MKL_KERNEL(VMul); +REGISTER_MKL_KERNEL(VAdd); +REGISTER_MKL_KERNEL(VScal); +REGISTER_MKL_KERNEL(VExp); +REGISTER_MKL_KERNEL(VSquare); +REGISTER_MKL_KERNEL(VCopy); +REGISTER_MKL_KERNEL(VBroadcast); +REGISTER_MKL_KERNEL(VSigmoid); +REGISTER_MKL_KERNEL(VTanh); +REGISTER_MKL_KERNEL(SeqPool); +REGISTER_MKL_KERNEL(EmbSeqPool); +REGISTER_MKL_KERNEL(Softmax); +REGISTER_MKL_KERNEL(Sgd); #undef REGISTER_MKL_KERNEL diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h index db2d6faed4fdcfebedb9d9eb752831259af30186..f51dca654cd3d93dcd396af7895aebf5ee915c22 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.h +++ b/paddle/fluid/operators/jit/more/mkl/mkl.h @@ -175,41 +175,38 @@ void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows, } } -#define DECLARE_MKL_KERNEL(name, tuples) \ - template \ - class name##Kernel : public KernelMore> { \ - public: \ - name##Kernel() { this->func = name; } \ - bool UseMe(const typename tuples::attr_type&) const override; \ - const char* ImplType() const override { return "MKL"; } \ +#define DECLARE_MKL_KERNEL(name) \ + template \ + class name##Kernel : public KernelMore> { \ + public: \ + name##Kernel() { this->func = name; } \ + bool CanBeUsed(const typename name##Tuple::attr_type&) const override; \ + const char* ImplType() const override { return "MKL"; } \ } // ABCMNK -DECLARE_MKL_KERNEL(MatMul, MatMulTuples); +DECLARE_MKL_KERNEL(MatMul); // XYZN -DECLARE_MKL_KERNEL(VMul, XYZNTuples); -DECLARE_MKL_KERNEL(VAdd, XYZNTuples); +DECLARE_MKL_KERNEL(VMul); +DECLARE_MKL_KERNEL(VAdd); // AXYN -DECLARE_MKL_KERNEL(VScal, AXYNTuples); +DECLARE_MKL_KERNEL(VScal); // XYN -DECLARE_MKL_KERNEL(VExp, XYNTuples); -DECLARE_MKL_KERNEL(VSigmoid, XYNTuples); -DECLARE_MKL_KERNEL(VTanh, XYNTuples); -DECLARE_MKL_KERNEL(VSquare, XYNTuples); -DECLARE_MKL_KERNEL(VCopy, XYNTuples); - -DECLARE_MKL_KERNEL(SeqPool, SeqPoolTuples); - -DECLARE_MKL_KERNEL(EmbSeqPool, EmbSeqPoolTuples); - -DECLARE_MKL_KERNEL(Softmax, SoftmaxTuples); - -DECLARE_MKL_KERNEL(Sgd, SgdTuples); - -DECLARE_MKL_KERNEL(VBroadcast, VBroadcastTuples); +DECLARE_MKL_KERNEL(VExp); +DECLARE_MKL_KERNEL(VSigmoid); +DECLARE_MKL_KERNEL(VTanh); +DECLARE_MKL_KERNEL(VSquare); +DECLARE_MKL_KERNEL(VCopy); + +// others +DECLARE_MKL_KERNEL(SeqPool); +DECLARE_MKL_KERNEL(EmbSeqPool); +DECLARE_MKL_KERNEL(Softmax); +DECLARE_MKL_KERNEL(Sgd); +DECLARE_MKL_KERNEL(VBroadcast); #undef DECLARE_MKL_KERNEL diff --git a/paddle/fluid/operators/jit/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc index c279d1b2ca4f53bb6bc5da0cab41e9086ed475bd..0d1c4770903fc59160e308b958270e5826928d61 100644 --- a/paddle/fluid/operators/jit/refer/refer.cc +++ b/paddle/fluid/operators/jit/refer/refer.cc @@ -17,51 +17,43 @@ namespace refer = paddle::operators::jit::refer; -#define REGISTER_REFER_KERNEL(key, func) \ - REGISTER_JITKERNEL_REFER(key, refer::func##Kernel, \ +#define REGISTER_REFER_KERNEL(func) \ + REGISTER_JITKERNEL_REFER(k##func, refer::func##Kernel, \ refer::func##Kernel) -REGISTER_REFER_KERNEL(kVMul, VMul); -REGISTER_REFER_KERNEL(kVAdd, VAdd); -REGISTER_REFER_KERNEL(kVAddRelu, VAddRelu); -REGISTER_REFER_KERNEL(kVSub, VSub); - -REGISTER_REFER_KERNEL(kVScal, VScal); -REGISTER_REFER_KERNEL(kVAddBias, VAddBias); - -REGISTER_REFER_KERNEL(kVRelu, VRelu); -REGISTER_REFER_KERNEL(kVCopy, VCopy); -REGISTER_REFER_KERNEL(kVIdentity, VIdentity); -REGISTER_REFER_KERNEL(kVSquare, VSquare); -REGISTER_REFER_KERNEL(kVExp, VExp); -REGISTER_REFER_KERNEL(kVSigmoid, VSigmoid); 
-REGISTER_REFER_KERNEL(kVTanh, VTanh); - -REGISTER_REFER_KERNEL(kLSTMCtHt, LSTMCtHt); -REGISTER_REFER_KERNEL(kLSTMC1H1, LSTMC1H1); - -REGISTER_REFER_KERNEL(kGRUH1, GRUH1); -REGISTER_REFER_KERNEL(kGRUHtPart1, GRUHtPart1); -REGISTER_REFER_KERNEL(kGRUHtPart2, GRUHtPart2); - -REGISTER_REFER_KERNEL(kCRFDecoding, CRFDecoding); -REGISTER_REFER_KERNEL(kLayerNorm, LayerNorm); - -REGISTER_REFER_KERNEL(kNCHW16CMulNC, NCHW16CMulNC); - -REGISTER_REFER_KERNEL(kSeqPool, SeqPool); - -REGISTER_REFER_KERNEL(kMatMul, MatMul); - -REGISTER_REFER_KERNEL(kHMax, HMax); -REGISTER_REFER_KERNEL(kHSum, HSum); - -REGISTER_REFER_KERNEL(kSoftmax, Softmax); - -REGISTER_REFER_KERNEL(kEmbSeqPool, EmbSeqPool); - -REGISTER_REFER_KERNEL(kSgd, Sgd); - -REGISTER_REFER_KERNEL(kVBroadcast, VBroadcast); +REGISTER_REFER_KERNEL(VMul); +REGISTER_REFER_KERNEL(VAdd); +REGISTER_REFER_KERNEL(VAddRelu); +REGISTER_REFER_KERNEL(VSub); + +REGISTER_REFER_KERNEL(VScal); +REGISTER_REFER_KERNEL(VAddBias); + +REGISTER_REFER_KERNEL(VRelu); +REGISTER_REFER_KERNEL(VCopy); +REGISTER_REFER_KERNEL(VIdentity); +REGISTER_REFER_KERNEL(VSquare); +REGISTER_REFER_KERNEL(VExp); +REGISTER_REFER_KERNEL(VSigmoid); +REGISTER_REFER_KERNEL(VTanh); + +REGISTER_REFER_KERNEL(LSTMCtHt); +REGISTER_REFER_KERNEL(LSTMC1H1); + +REGISTER_REFER_KERNEL(GRUH1); +REGISTER_REFER_KERNEL(GRUHtPart1); +REGISTER_REFER_KERNEL(GRUHtPart2); + +REGISTER_REFER_KERNEL(CRFDecoding); +REGISTER_REFER_KERNEL(LayerNorm); +REGISTER_REFER_KERNEL(NCHW16CMulNC); +REGISTER_REFER_KERNEL(SeqPool); +REGISTER_REFER_KERNEL(MatMul); +REGISTER_REFER_KERNEL(HMax); +REGISTER_REFER_KERNEL(HSum); +REGISTER_REFER_KERNEL(Softmax); +REGISTER_REFER_KERNEL(EmbSeqPool); +REGISTER_REFER_KERNEL(Sgd); +REGISTER_REFER_KERNEL(VBroadcast); #undef REGISTER_REFER_KERNEL diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index b3b2097828c5b6d647fd6bfe14a6e8bff04409e0..cac705a484127b4813ef2d0996bf5aaee2b9f1b3 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -490,60 +490,54 @@ void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows, } } -#define DECLARE_REFER_KERNEL(name, tuples) \ - template \ - class name##Kernel : public ReferKernel> { \ - public: \ - name##Kernel() { this->func = name; } \ +#define DECLARE_REFER_KERNEL(name) \ + template \ + class name##Kernel : public ReferKernel> { \ + public: \ + name##Kernel() { this->func = name; } \ } // const T* x, const T* y, T* z, int n -DECLARE_REFER_KERNEL(VMul, XYZNTuples); -DECLARE_REFER_KERNEL(VAdd, XYZNTuples); -DECLARE_REFER_KERNEL(VAddRelu, XYZNTuples); -DECLARE_REFER_KERNEL(VSub, XYZNTuples); +DECLARE_REFER_KERNEL(VMul); +DECLARE_REFER_KERNEL(VAdd); +DECLARE_REFER_KERNEL(VAddRelu); +DECLARE_REFER_KERNEL(VSub); // const T* a, const T* x, T* y, int n -DECLARE_REFER_KERNEL(VScal, AXYNTuples); -DECLARE_REFER_KERNEL(VAddBias, AXYNTuples); +DECLARE_REFER_KERNEL(VScal); +DECLARE_REFER_KERNEL(VAddBias); // const T* x, T* y, int n -DECLARE_REFER_KERNEL(VRelu, XYNTuples); -DECLARE_REFER_KERNEL(VIdentity, XYNTuples); -DECLARE_REFER_KERNEL(VExp, XYNTuples); -DECLARE_REFER_KERNEL(VSigmoid, XYNTuples); -DECLARE_REFER_KERNEL(VTanh, XYNTuples); -DECLARE_REFER_KERNEL(VSquare, XYNTuples); -DECLARE_REFER_KERNEL(VCopy, XYNTuples); +DECLARE_REFER_KERNEL(VRelu); +DECLARE_REFER_KERNEL(VIdentity); +DECLARE_REFER_KERNEL(VExp); +DECLARE_REFER_KERNEL(VSigmoid); +DECLARE_REFER_KERNEL(VTanh); +DECLARE_REFER_KERNEL(VSquare); +DECLARE_REFER_KERNEL(VCopy); // lstm_t*, const lstm_attr_t* 
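The refer.cc registrations above get the same treatment as the MKL ones: the `(key, func)` pairs collapse to a single argument, so the key `kVMul` and the class `VMulKernel` can no longer drift apart. A toy illustration of the failure mode the old two-argument form permitted:

```cpp
// Illustrative macros only -- register_impl and both REGISTER_* forms are
// made up for this sketch, not Paddle's registry.
#include <iostream>

void register_impl(const char* key, const char* cls) {
  std::cout << key << " -> " << cls << "\n";
}

#define REGISTER_OLD(key, func) register_impl(#key, #func "Kernel")
#define REGISTER_NEW(func) register_impl("k" #func, #func "Kernel")

int main() {
  REGISTER_OLD(kVAdd, VMul);  // mismatch compiles silently: kVAdd -> VMulKernel
  REGISTER_NEW(VMul);         // key is always derived: kVMul -> VMulKernel
}
```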
-DECLARE_REFER_KERNEL(LSTMCtHt, LSTMTuples); -DECLARE_REFER_KERNEL(LSTMC1H1, LSTMTuples); +DECLARE_REFER_KERNEL(LSTMCtHt); +DECLARE_REFER_KERNEL(LSTMC1H1); // gru_t*, const gru_attr_t* -DECLARE_REFER_KERNEL(GRUH1, GRUTuples); -DECLARE_REFER_KERNEL(GRUHtPart1, GRUTuples); -DECLARE_REFER_KERNEL(GRUHtPart2, GRUTuples); - -DECLARE_REFER_KERNEL(CRFDecoding, CRFDecodingTuples); -DECLARE_REFER_KERNEL(LayerNorm, LayerNormTuples); - -DECLARE_REFER_KERNEL(NCHW16CMulNC, NCHW16CMulNCTuples); - -DECLARE_REFER_KERNEL(SeqPool, SeqPoolTuples); - -DECLARE_REFER_KERNEL(MatMul, MatMulTuples); - -DECLARE_REFER_KERNEL(HMax, XRNTuples); -DECLARE_REFER_KERNEL(HSum, XRNTuples); - -DECLARE_REFER_KERNEL(Softmax, SoftmaxTuples); - -DECLARE_REFER_KERNEL(EmbSeqPool, EmbSeqPoolTuples); - -DECLARE_REFER_KERNEL(Sgd, SgdTuples); - -DECLARE_REFER_KERNEL(VBroadcast, VBroadcastTuples); +DECLARE_REFER_KERNEL(GRUH1); +DECLARE_REFER_KERNEL(GRUHtPart1); +DECLARE_REFER_KERNEL(GRUHtPart2); + +DECLARE_REFER_KERNEL(HMax); +DECLARE_REFER_KERNEL(HSum); + +// others +DECLARE_REFER_KERNEL(CRFDecoding); +DECLARE_REFER_KERNEL(LayerNorm); +DECLARE_REFER_KERNEL(NCHW16CMulNC); +DECLARE_REFER_KERNEL(SeqPool); +DECLARE_REFER_KERNEL(MatMul); +DECLARE_REFER_KERNEL(Softmax); +DECLARE_REFER_KERNEL(EmbSeqPool); +DECLARE_REFER_KERNEL(Sgd); +DECLARE_REFER_KERNEL(VBroadcast); #undef DECLARE_REFER_KERNEL diff --git a/paddle/fluid/operators/jit/registry.h b/paddle/fluid/operators/jit/registry.h index cb32c487208fe8fe9e72c069db8833c736316aec..567a903236979ff4ac6095033f53d2a473f4eb2c 100644 --- a/paddle/fluid/operators/jit/registry.h +++ b/paddle/fluid/operators/jit/registry.h @@ -17,6 +17,7 @@ #include #include #include +#include // for std::move #include "paddle/fluid/operators/jit/kernel_base.h" #include "paddle/fluid/operators/jit/kernel_pool.h" #include "paddle/fluid/platform/place.h" @@ -49,8 +50,8 @@ struct JitKernelRegistrarFunctor { void operator()(KernelType kt) const { KernelKey kkey(kt, PlaceType()); - Pool().Instance().Insert(kkey, - std::move(make_unique())); + Pool::Instance().Insert(kkey, + std::move(make_unique())); constexpr auto size = std::tuple_size>::value; JitKernelRegistrarFunctor diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index cdec14dc4383897f4ae24fc89b99fe00c713cf42..6c099a7a062472e2701401ddc58bb9051074f810 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include +#include #include #include #include @@ -64,413 +65,23 @@ std::vector TestSizes() { namespace jit = paddle::operators::jit; using CPUPlace = paddle::platform::CPUPlace; -template -struct TestFuncWithRefer { - void operator()(const typename KernelTuples::func_type tgt, Args... 
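The registry.h change above is small but worth pausing on: `Pool().Instance()` default-constructed a throwaway `Pool` object before calling the static `Instance()`, while the new `Pool::Instance()` names the type directly (the added include is evidently `<utility>`, per its `// for std::move` comment). A minimal reproduction of the difference:

```cpp
#include <iostream>

struct Pool {
  Pool() { std::cout << "Pool constructed\n"; }
  static Pool& Instance() {
    static Pool pool;  // the one real instance
    return pool;
  }
};

int main() {
  Pool::Instance();   // first call constructs only the singleton
  Pool().Instance();  // old spelling: also constructs a discarded temporary
}
```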
args) { - LOG(FATAL) << "Should specify this function."; - } -}; - -template -struct TestFuncWithRefer, std::vector, std::vector, - std::vector> { - void operator()(const typename jit::XYZNTuples::func_type tgt, - const std::vector& x, const std::vector& y, - const std::vector& zref) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(zref.size(), x.size()); - EXPECT_EQ(zref.size(), y.size()); - const T* x_data = x.data(); - const T* y_data = y.data(); - const T* zref_data = zref.data(); - const int d = zref.size(); - - std::vector ztgt(d); - T* ztgt_data = ztgt.data(); - // test normal - tgt(x_data, y_data, ztgt_data, d); - ExpectEQ(ztgt_data, zref_data, d); - // test inplace x - std::copy(x.begin(), x.end(), ztgt.begin()); - tgt(ztgt_data, y_data, ztgt_data, d); - ExpectEQ(ztgt_data, zref_data, d); - // test inplace y - std::copy(y.begin(), y.end(), ztgt.begin()); - tgt(x_data, ztgt_data, ztgt_data, d); - ExpectEQ(ztgt_data, zref_data, d); - } -}; - -template -struct TestFuncWithRefer, T, std::vector, - std::vector> { - void operator()(const typename jit::AXYNTuples::func_type tgt, const T a, - const std::vector& x, const std::vector& yref) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(yref.size(), x.size()); - const T* x_data = x.data(); - const T* yref_data = yref.data(); - const int d = yref.size(); - std::vector ytgt(d); - T* ytgt_data = ytgt.data(); - // test normal - tgt(&a, x_data, ytgt_data, d); - ExpectEQ(ytgt_data, yref_data, d); - // test inplace x - std::copy(x.begin(), x.end(), ytgt.begin()); - tgt(&a, ytgt_data, ytgt_data, d); - ExpectEQ(ytgt_data, yref_data, d); - } -}; - -template -struct TestFuncWithRefer, std::vector, std::vector, - int, int> { - void operator()(const typename jit::SoftmaxTuples::func_type tgt, - const std::vector& x, const std::vector& yref, int n, - int bs) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(yref.size(), x.size()); - EXPECT_EQ(x.size(), static_cast(n * bs)); - const T* x_data = x.data(); - const T* yref_data = yref.data(); - std::vector ytgt(n * bs); - T* ytgt_data = ytgt.data(); - // test normal - tgt(x_data, ytgt_data, n, bs); - ExpectEQ(ytgt_data, yref_data, n * bs); - // test inplace x - std::copy(x.begin(), x.end(), ytgt.begin()); - tgt(ytgt_data, ytgt_data, n, bs); - ExpectEQ(ytgt_data, yref_data, n * bs); - } -}; - -template -struct TestFuncWithRefer, std::vector, T> { - void operator()(const typename jit::XRNTuples::func_type tgt, - const std::vector& x, const T ref_res) { - EXPECT_TRUE(tgt != nullptr); - T tgt_res; - tgt(x.data(), &tgt_res, x.size()); - ExpectEQ(&tgt_res, &ref_res, 1); - } -}; - -template -struct TestFuncWithRefer, std::vector, - std::vector, int64_t, - typename jit::VBroadcastTuples::attr_type> { - void operator()(const typename jit::VBroadcastTuples::func_type tgt, - const std::vector& x, const std::vector& yref, - int64_t h, - const typename jit::VBroadcastTuples::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(x.size(), static_cast(attr)); - EXPECT_EQ(yref.size(), x.size() * h); - std::vector y(yref.size()); - const T* x_data = x.data(); - const T* yref_data = yref.data(); - T* y_data = y.data(); - tgt(x_data, y_data, h, attr); - ExpectEQ(y_data, yref_data, yref.size()); - } -}; - -template -struct TestFuncWithRefer, std::vector, std::vector> { - void operator()(const typename jit::XYNTuples::func_type tgt, - const std::vector& x, const std::vector& yref) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(yref.size(), x.size()); - const T* x_data = x.data(); - const T* yref_data = yref.data(); - const int d = 
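All the `TestFuncWithRefer` specializations being deleted through here shared one design: a primary template whose only job was to `LOG(FATAL)` when a signature had no specialization, meaning a missing test path surfaced at runtime rather than at compile time. A toy reconstruction of that shape, for contrast with the lambda-based flow that replaces it:

```cpp
// Toy reconstruction of the removed pattern (simplified; std::abort stands
// in for LOG(FATAL)).
#include <cstdio>
#include <cstdlib>
#include <vector>

template <typename Func, typename... Args>
struct TestFuncWithRefer {
  void operator()(Func, Args...) {
    std::fprintf(stderr, "Should specify this function.\n");
    std::abort();  // unspecialized signature: fails only when executed
  }
};

using VCopyFunc = void (*)(const float*, float*, int);

// One hand-written specialization per tested signature:
template <>
struct TestFuncWithRefer<VCopyFunc, std::vector<float>, std::vector<float>> {
  void operator()(VCopyFunc tgt, std::vector<float> x, std::vector<float> y) {
    tgt(x.data(), y.data(), static_cast<int>(x.size()));
  }
};

void vcopy(const float* x, float* y, int n) {
  for (int i = 0; i < n; ++i) y[i] = x[i];
}

int main() {
  TestFuncWithRefer<VCopyFunc, std::vector<float>, std::vector<float>>()(
      vcopy, {1.f, 2.f}, {0.f, 0.f});
}
```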
yref.size(); - std::vector ytgt(d); - T* ytgt_data = ytgt.data(); - // test normal - tgt(x_data, ytgt_data, d); - ExpectEQ(ytgt_data, yref_data, d); - // test inplace x - std::copy(x.begin(), x.end(), ytgt.begin()); - tgt(ytgt_data, ytgt_data, d); - ExpectEQ(ytgt_data, yref_data, d); - } -}; - -template -struct TestFuncWithRefer, std::vector, std::vector, - std::vector, std::vector, std::vector, - typename jit::LSTMTuples::attr_type> { - void operator()(const typename jit::LSTMTuples::func_type tgt, - const std::vector& xsrc, const std::vector& wp, - const std::vector& ct_1, const std::vector& ct_ref, - const std::vector& ht_ref, - const typename jit::LSTMTuples::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(ct_ref.size(), ht_ref.size()); - EXPECT_EQ(ct_1.size(), ht_ref.size()); - EXPECT_EQ(xsrc.size(), 4 * ht_ref.size()); - EXPECT_EQ(wp.size(), 3 * ht_ref.size()); - - // x could be changed after compute, so copy to save src - int d = ht_ref.size(); - std::vector x(xsrc.size()), ct(ct_ref.size()), ht(ht_ref.size()); - std::vector checked(2 * d); - std::copy(xsrc.begin(), xsrc.end(), x.begin()); - - const T* ct_1_data = ct_1.data(); - const T* wp_data = wp.data(); - const T* ct_ref_data = ct_ref.data(); - const T* ht_ref_data = ht_ref.data(); - T* x_data = x.data(); - T* ct_data = ct.data(); - T* ht_data = ht.data(); - T* checked_data = checked.data(); - - jit::lstm_t step; - step.gates = x_data; - step.ct_1 = ct_1_data; - step.ct = ct_data; - step.ht = ht_data; - if (attr.use_peephole) { - step.wp = wp_data; - step.checked = checked_data; - } - - tgt(&step, &attr); - ExpectEQ(ct_data, ct_ref_data, d); - ExpectEQ(ht_data, ht_ref_data, d); - } -}; - -template -struct TestFuncWithRefer, std::vector, std::vector, - std::vector, - typename jit::GRUTuples::attr_type> { - void operator()(const typename jit::GRUTuples::func_type tgt, - const std::vector& xsrc, const std::vector& ht_1, - const std::vector& ht_ref, - const typename jit::GRUTuples::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(ht_1.size(), ht_ref.size()); - EXPECT_EQ(xsrc.size(), 3 * ht_ref.size()); - - // x could be changed after compute, so copy to save src - int d = ht_ref.size(); - std::vector x(xsrc.size()), ht(ht_ref.size()); - std::copy(xsrc.begin(), xsrc.end(), x.begin()); - const T* ht_1_data = ht_1.data(); - const T* ht_ref_data = ht_ref.data(); - T* x_data = x.data(); - T* ht_data = ht.data(); - jit::gru_t step; - step.gates = x_data; - step.ht_1 = ht_1_data; - step.ht = ht_data; - tgt(&step, &attr); - ExpectEQ(ht_data, ht_ref_data, d); - } -}; - -template -struct TestFuncWithRefer, std::vector, std::vector, - typename jit::SeqPoolTuples::attr_type> { - void operator()(const typename jit::SeqPoolTuples::func_type tgt, - const std::vector& x, const std::vector& yref, - const typename jit::SeqPoolTuples::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(x.size() % yref.size(), static_cast(0)); - int w = yref.size(); - std::vector y(w); - const T* x_data = x.data(); - const T* yref_data = yref.data(); - T* y_data = y.data(); - tgt(x_data, y_data, &attr); - ExpectEQ(y_data, yref_data, w); - } -}; - -template -struct TestFuncWithRefer, std::vector, - std::vector, std::vector, - typename jit::EmbSeqPoolTuples::attr_type> { - void operator()(const typename jit::EmbSeqPoolTuples::func_type tgt, - const std::vector& table, const std::vector& idx, - const std::vector& oref, - const typename jit::EmbSeqPoolTuples::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(table.size(), - 
static_cast(attr.table_height * attr.table_width)); - EXPECT_EQ(idx.size(), - static_cast(attr.index_height * attr.index_width)); - EXPECT_EQ(oref.size(), - static_cast(attr.table_width * attr.index_width)); - const T* table_data = table.data(); - const int64_t* idx_data = idx.data(); - const T* oref_data = oref.data(); - int o_w = oref.size(); - std::vector out(o_w); - T* o_data = out.data(); - tgt(table_data, idx_data, o_data, &attr); - ExpectEQ(o_data, oref_data, o_w); - } -}; - -template -struct TestFuncWithRefer, T, std::vector, std::vector, - std::vector, std::vector, - typename jit::SgdTuples::attr_type> { - void operator()(const typename jit::SgdTuples::func_type tgt, const T lr, - const std::vector& param, const std::vector& grad, - const std::vector& rows, const std::vector& oref, - const typename jit::SgdTuples::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(param.size(), - static_cast(attr.param_height * attr.param_width)); - EXPECT_EQ(grad.size(), - static_cast(attr.grad_height * attr.grad_width)); - EXPECT_EQ(rows.size(), static_cast(attr.selected_rows_size)); - EXPECT_EQ(param.size(), oref.size()); - const T* param_data = param.data(); - const T* grad_data = grad.data(); - const int64_t* rows_data = rows.data(); - const T* oref_data = oref.data(); - - std::vector out(oref.size()); - T* o_data = out.data(); - tgt(&lr, param_data, grad_data, rows_data, o_data, &attr); - // only the selected rows should be equal - for (size_t i = 0; i < rows.size(); ++i) { - ExpectEQ(o_data + rows[i] * attr.grad_width, - oref_data + rows[i] * attr.grad_width, attr.grad_width); - } - - // inplace - std::copy(param.begin(), param.end(), out.begin()); - tgt(&lr, o_data, grad_data, rows_data, o_data, &attr); - for (size_t i = 0; i < rows.size(); ++i) { - ExpectEQ(o_data + rows[i] * attr.grad_width, - oref_data + rows[i] * attr.grad_width, attr.grad_width); - } - } -}; - -template -struct TestFuncWithRefer, std::vector, std::vector, - std::vector, - typename jit::MatMulTuples::attr_type> { - void operator()(const typename jit::MatMulTuples::func_type tgt, - const std::vector& a, const std::vector& b, - const std::vector& cref, - const typename jit::MatMulTuples::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(a.size(), static_cast(attr.m * attr.k)); - EXPECT_EQ(b.size(), static_cast(attr.k * attr.n)); - EXPECT_EQ(cref.size(), static_cast(attr.m * attr.n)); - std::vector c(cref.size()); - const T* a_data = a.data(); - const T* b_data = b.data(); - const T* cref_data = cref.data(); - T* c_data = c.data(); - tgt(a_data, b_data, c_data, &attr); - ExpectEQ(c_data, cref_data, attr.m * attr.n); - } -}; - -template -struct TestFuncWithRefer, std::vector, - std::vector, std::vector, std::vector, - std::vector, std::vector, int, float, int> { - void operator()(const typename jit::LayerNormTuples::func_type tgt, - std::vector& x, std::vector& outref, // NOLINT - std::vector& mean, std::vector& var, // NOLINT - const std::vector& scale, const std::vector& bias, - int left, const float epsilon, int right) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(x.size(), static_cast(left * right)); - EXPECT_EQ(outref.size(), static_cast(left * right)); - EXPECT_EQ(mean.size(), static_cast(left)); - EXPECT_EQ(var.size(), static_cast(left)); - EXPECT_EQ(scale.size(), static_cast(right)); - EXPECT_EQ(bias.size(), static_cast(right)); - std::vector outtgt(outref.size()); - const T* scale_data = scale.data(); - const T* bias_data = bias.data(); - T* x_data = x.data(); - T* mean_data = mean.data(); - 
T* var_data = var.data(); - T* outref_data = outref.data(); - T* outtgt_data = outtgt.data(); - - tgt(x_data, outtgt_data, mean_data, var_data, scale_data, bias_data, left, - epsilon, right); - ExpectEQ(outtgt_data, outref_data, left * right); - } -}; - -template -struct TestFuncWithRefer, int, std::vector, - std::vector, std::vector, std::vector, - int> { - void operator()(const typename jit::CRFDecodingTuples::func_type tgt, - const int seq_len, const std::vector& x, - const std::vector& w, std::vector& alpharef, // NOLINT - std::vector& trackref, int tag_num) { // NOLINT - constexpr int state_trans_base_idx = 2; - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(x.size(), static_cast(seq_len * tag_num)); - EXPECT_EQ(w.size(), - static_cast((tag_num + state_trans_base_idx) * tag_num)); - EXPECT_EQ(alpharef.size(), static_cast(seq_len * tag_num)); - EXPECT_EQ(trackref.size(), static_cast(seq_len * tag_num)); - std::vector alphatgt(alpharef.size()); - std::vector tracktgt(trackref.size()); - - memcpy(trackref.data(), tracktgt.data(), tag_num * sizeof(int)); - tgt(seq_len, (const T*)x.data(), (const T*)w.data(), alphatgt.data(), - tracktgt.data(), tag_num); - ExpectEQ(alpharef.data(), alphatgt.data(), seq_len * tag_num); - ExpectEQ(trackref.data(), tracktgt.data(), seq_len * tag_num); - } -}; - -template -void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { - TestFuncWithRefer test; - // test jitcode - auto jitcode = jit::GetJitCode(attr); - if (jitcode) { - VLOG(10) << "Test Jitcode Kernel "; - test(jitcode, args...); +void TestAllImpls(const typename KernelTuple::attr_type& attr, + const Tester& verifier, const Args&... args) { + auto funcs = jit::GetAllCandidateFuncsWithTypes(attr); + for (auto f : funcs) { + VLOG(10) << "Test Kernel " << f.first; + verifier(f.second, args...); } - // test all impls in more - jit::KernelKey kkey(KT, PlaceType()); - auto& pool = jit::KernelPool().Instance().AllKernels(); - auto iter = pool.find(kkey); - if (iter != pool.end()) { - auto& impls = iter->second; - for (auto& impl : impls) { - auto i = dynamic_cast*>(impl.get()); - if (i && i->UseMe(attr)) { - auto more = i->GetFunc(); - VLOG(10) << "Test More Kernel : " << i->ImplType(); - test(more, args...); - } - } - } - // test result from Get function - // VLOG(10) << "Test Get function "; - auto tgt = jit::Get(attr); - test(tgt, args...); } -template -void TestKernelXYZNTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); +template +void TestKernelXYZN() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); for (int d : TestSizes()) { - auto ref = jit::GetRefer>(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector x(d), y(d), zref(d); @@ -494,16 +105,42 @@ void TestKernelXYZNTuples() { ExpectEQ(xinp_data, zref_data, d); ExpectEQ(yinp_data, zref_data, d); - TestAllImpls, PlaceType, std::vector, - std::vector, std::vector>(d, x, y, zref); + auto verifier = [](const typename KernelTuple::func_type tgt, + const std::vector& x, const std::vector& y, + const std::vector& zref) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(zref.size(), x.size()); + EXPECT_EQ(zref.size(), y.size()); + const T* x_data = x.data(); + const T* y_data = y.data(); + const T* zref_data = zref.data(); + const int d = zref.size(); + + std::vector ztgt(d); + T* ztgt_data = ztgt.data(); + // test normal + tgt(x_data, y_data, ztgt_data, d); + ExpectEQ(ztgt_data, zref_data, d); + // test inplace x + 
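The replacement `TestAllImpls` above is the heart of the rewrite: instead of dispatching through per-tuple specializations and probing jitcode, "more", and refer pools by hand, it asks the framework for every candidate implementation of an attribute and runs one verifier lambda over each. A self-contained miniature of that loop (the candidate list and kernels here are invented; only the shape matches):

```cpp
#include <cassert>
#include <cmath>
#include <functional>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

using Func = std::function<float(float)>;

// Stand-in for jit::GetAllCandidateFuncsWithTypes(attr): name + callable pairs.
std::vector<std::pair<std::string, Func>> AllCandidates() {
  return {{"refer", [](float x) { return std::exp(x); }},
          {"more/mkl", [](float x) { return std::exp(x); }}};
}

template <typename Tester, typename... Args>
void TestAllImpls(const Tester& verifier, const Args&... args) {
  for (const auto& f : AllCandidates()) {
    std::cout << "Test Kernel " << f.first << "\n";
    verifier(f.second, args...);  // identical checks for every implementation
  }
}

int main() {
  auto verifier = [](const Func& tgt, float x, float yref) {
    assert(std::fabs(tgt(x) - yref) < 1e-5f);
  };
  TestAllImpls(verifier, 1.f, std::exp(1.f));
}
```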
std::copy(x.begin(), x.end(), ztgt.begin()); + tgt(ztgt_data, y_data, ztgt_data, d); + ExpectEQ(ztgt_data, zref_data, d); + // test inplace y + std::copy(y.begin(), y.end(), ztgt.begin()); + tgt(x_data, ztgt_data, ztgt_data, d); + ExpectEQ(ztgt_data, zref_data, d); + }; + + TestAllImpls(d, verifier, x, y, zref); } } -template -void TestKernelAXYNTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); +template +void TestKernelAXYN() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); for (int d : TestSizes()) { - auto ref = jit::GetRefer>(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); const T a = static_cast(3); @@ -520,34 +157,33 @@ void TestKernelAXYNTuples() { ref(&a, xinp_data, xinp_data, d); ExpectEQ(xinp_data, yref_data, d); - TestAllImpls, PlaceType, T, std::vector, - std::vector>(d, a, x, yref); - } -} - -template -void TestKernelXRNTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); - auto last_acc = FLAGS_acc; - FLAGS_acc = 1e-4; - for (int d : TestSizes()) { - auto ref = jit::GetRefer>(); - EXPECT_TRUE(ref != nullptr); - std::vector x(d); - RandomVec(d, x.data()); - T ref_res; - ref(x.data(), &ref_res, d); - TestAllImpls, PlaceType, std::vector, T>(d, x, - ref_res); + auto verifier = [](const typename KernelTuple::func_type tgt, const T a, + const std::vector& x, const std::vector& yref) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(yref.size(), x.size()); + const T* x_data = x.data(); + const T* yref_data = yref.data(); + const int d = yref.size(); + std::vector ytgt(d); + T* ytgt_data = ytgt.data(); + // test normal + tgt(&a, x_data, ytgt_data, d); + ExpectEQ(ytgt_data, yref_data, d); + // test inplace x + std::copy(x.begin(), x.end(), ytgt.begin()); + tgt(&a, ytgt_data, ytgt_data, d); + ExpectEQ(ytgt_data, yref_data, d); + }; + TestAllImpls(d, verifier, a, x, yref); } - FLAGS_acc = last_acc; } -template -void TestKernelXYNTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); +template +void TestKernelXYN() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); for (int d : TestSizes()) { - auto ref = jit::GetRefer>(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector x(d), yref(d); @@ -562,15 +198,57 @@ void TestKernelXYNTuples() { ref(x_data, yref_data, d); ref(xinp_data, xinp_data, d); ExpectEQ(xinp_data, yref_data, d); + auto verifier = [](const typename KernelTuple::func_type tgt, + const std::vector& x, const std::vector& yref) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(yref.size(), x.size()); + const T* x_data = x.data(); + const T* yref_data = yref.data(); + const int d = yref.size(); + std::vector ytgt(d); + T* ytgt_data = ytgt.data(); + // test normal + tgt(x_data, ytgt_data, d); + ExpectEQ(ytgt_data, yref_data, d); + // test inplace x + std::copy(x.begin(), x.end(), ytgt.begin()); + tgt(ytgt_data, ytgt_data, d); + ExpectEQ(ytgt_data, yref_data, d); + }; + TestAllImpls(d, verifier, x, yref); + } +} - TestAllImpls, PlaceType, std::vector, - std::vector>(d, x, yref); +template +void TestKernelXRN() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); + auto last_acc = FLAGS_acc; + FLAGS_acc = 1e-4; + for (int d : TestSizes()) { + auto ref = jit::GetReferFunc(); + EXPECT_TRUE(ref != nullptr); + std::vector x(d); + RandomVec(d, x.data()); + T ref_res; + 
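A detail worth keeping in view through these verifiers: each one re-runs the kernel with the output buffer aliasing an input (the `test inplace x` / `test inplace y` branches), because callers are allowed to pass the same pointer for input and output. A minimal version of that aliasing check:

```cpp
#include <cassert>
#include <vector>

// Toy XYZN kernel: z = x + y, element-wise.
void vadd(const float* x, const float* y, float* z, int n) {
  for (int i = 0; i < n; ++i) z[i] = x[i] + y[i];
}

int main() {
  std::vector<float> x{1, 2, 3}, y{4, 5, 6}, zref(3);
  vadd(x.data(), y.data(), zref.data(), 3);  // normal call

  std::vector<float> z(x);                // start from a copy of x...
  vadd(z.data(), y.data(), z.data(), 3);  // ...then let z alias the x input
  for (int i = 0; i < 3; ++i) assert(z[i] == zref[i]);
}
```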
ref(x.data(), &ref_res, d); + + auto verifier = [](const typename KernelTuple::func_type tgt, + const std::vector& x, const T ref_res) { + EXPECT_TRUE(tgt != nullptr); + T tgt_res; + tgt(x.data(), &tgt_res, x.size()); + ExpectEQ(&tgt_res, &ref_res, 1); + }; + TestAllImpls(d, verifier, x, ref_res); } + FLAGS_acc = last_acc; } -template -void TestKernelLSTMTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); +template +void TestKernelLSTM() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); std::vector all_acts = {"sigmoid", "tanh", "relu", "identity"}; auto test_sizes = TestSizes(); test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000)); @@ -582,7 +260,7 @@ void TestKernelLSTMTuples() { const jit::lstm_attr_t attr( d, jit::to_kerneltype(act_gate), jit::to_kerneltype(act_cand), jit::to_kerneltype(act_cell), use_peephole); - auto ref = jit::GetRefer>(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector xsrc(4 * d), wp(3 * d), ct_1(d); std::vector ct_ref(d), ht_ref(d), checked(2 * d); @@ -609,10 +287,51 @@ void TestKernelLSTMTuples() { } ref(&step, &attr); VLOG(10) << attr; - TestAllImpls, PlaceType, std::vector, - std::vector, std::vector, std::vector, - std::vector>(attr, xsrc, wp, ct_1, ct_ref, ht_ref, - attr); + + auto verifier = []( + const typename KernelTuple::func_type tgt, + const std::vector& xsrc, const std::vector& wp, + const std::vector& ct_1, const std::vector& ct_ref, + const std::vector& ht_ref, + const typename KernelTuple::attr_type& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(ct_ref.size(), ht_ref.size()); + EXPECT_EQ(ct_1.size(), ht_ref.size()); + EXPECT_EQ(xsrc.size(), 4 * ht_ref.size()); + EXPECT_EQ(wp.size(), 3 * ht_ref.size()); + + // x could be changed after compute, so copy to save src + int d = ht_ref.size(); + std::vector x(xsrc.size()), ct(ct_ref.size()), + ht(ht_ref.size()); + std::vector checked(2 * d); + std::copy(xsrc.begin(), xsrc.end(), x.begin()); + + const T* ct_1_data = ct_1.data(); + const T* wp_data = wp.data(); + const T* ct_ref_data = ct_ref.data(); + const T* ht_ref_data = ht_ref.data(); + T* x_data = x.data(); + T* ct_data = ct.data(); + T* ht_data = ht.data(); + T* checked_data = checked.data(); + + jit::lstm_t step; + step.gates = x_data; + step.ct_1 = ct_1_data; + step.ct = ct_data; + step.ht = ht_data; + if (attr.use_peephole) { + step.wp = wp_data; + step.checked = checked_data; + } + + tgt(&step, &attr); + ExpectEQ(ct_data, ct_ref_data, d); + ExpectEQ(ht_data, ht_ref_data, d); + }; + TestAllImpls(attr, verifier, xsrc, wp, ct_1, + ct_ref, ht_ref, attr); } } } @@ -620,9 +339,10 @@ void TestKernelLSTMTuples() { } } -template -void TestKernelGRUTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); +template +void TestKernelGRU() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); std::vector all_acts = {"sigmoid", "tanh", "relu", "identity"}; auto test_sizes = TestSizes(); test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000)); @@ -631,7 +351,7 @@ void TestKernelGRUTuples() { for (auto& act_cand : all_acts) { const jit::gru_attr_t attr(d, jit::to_kerneltype(act_gate), jit::to_kerneltype(act_cand)); - auto ref = jit::GetRefer>(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector xsrc(3 * d), ht_1(d), ht_ref(d); RandomVec(3 * d, xsrc.data()); @@ -648,17 +368,218 @@ void 
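The LSTM verifier above (and the GRU one just below) copies `xsrc` before calling the kernel, per its comment: the gates buffer is an in/out parameter that the kernel overwrites with activations, so the pristine input has to be saved to keep every candidate's run comparable. The pattern in miniature, with a toy step struct rather than Paddle's `jit::lstm_t`:

```cpp
#include <vector>

struct step_t {
  float* gates;  // in/out: the kernel writes activations back into this
  float* ht;     // out
};

void toy_kernel(step_t* s, int d) {
  for (int i = 0; i < d; ++i) {
    s->gates[i] *= 0.5f;     // clobbers the input in place
    s->ht[i] = s->gates[i];
  }
}

int main() {
  const int d = 4;
  std::vector<float> xsrc(d, 1.f), ht(d);
  std::vector<float> x(xsrc);  // copy first: xsrc stays intact for the next impl
  step_t step{x.data(), ht.data()};
  toy_kernel(&step, d);
}
```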
TestKernelGRUTuples() { step.ht = ht_ref_data; ref(&step, &attr); VLOG(10) << attr; - TestAllImpls, PlaceType, std::vector, - std::vector, std::vector>(attr, xsrc, ht_1, ht_ref, - attr); + auto verifier = [](const typename KernelTuple::func_type tgt, + const std::vector& xsrc, + const std::vector& ht_1, + const std::vector& ht_ref, + const typename KernelTuple::attr_type& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(ht_1.size(), ht_ref.size()); + EXPECT_EQ(xsrc.size(), 3 * ht_ref.size()); + + // x could be changed after compute, so copy to save src + int d = ht_ref.size(); + std::vector x(xsrc.size()), ht(ht_ref.size()); + std::copy(xsrc.begin(), xsrc.end(), x.begin()); + const T* ht_1_data = ht_1.data(); + const T* ht_ref_data = ht_ref.data(); + T* x_data = x.data(); + T* ht_data = ht.data(); + jit::gru_t step; + step.gates = x_data; + step.ht_1 = ht_1_data; + step.ht = ht_data; + tgt(&step, &attr); + ExpectEQ(ht_data, ht_ref_data, d); + }; + TestAllImpls(attr, verifier, xsrc, ht_1, ht_ref, + attr); } } } } -template -void TestKernelSeqPoolTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); +template +void TestKernelNCHW16CMulNC() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); + const int n = 3, c = 16 * 4, h = 10, w = 10; + auto ref = jit::GetReferFunc(); + EXPECT_TRUE(ref != nullptr); + int sz = n * c * h * w; + std::vector x(sz), y(n * c), zref(sz); + std::vector ztgt(sz), zjit(sz); + RandomVec(sz, x.data()); + RandomVec(n * c, y.data()); + + const T* x_data = x.data(); + const T* y_data = y.data(); + T* zref_data = zref.data(); + T* ztgt_data = ztgt.data(); + T* zjit_data = zjit.data(); + constexpr int simd_width = ZMM_FLOAT_BLOCK; + int C = c / simd_width; + auto tgt = jit::KernelFuncs::Cache().At(0); + auto funcs = jit::GetAllCandidateFuncs(0); + EXPECT_GT(funcs.size(), 0UL); + auto jitcode = funcs[0]; + EXPECT_TRUE(tgt != nullptr); + + if (std::is_same::value && + paddle::platform::MayIUse(paddle::platform::avx512f)) { + EXPECT_TRUE(jitcode != nullptr); + } + for (int ni = 0; ni < n; ni++) { + for (int ci = 0; ci < C; ci++) { + auto ptr_x = + x_data + ni * C * h * w * simd_width + ci * h * w * simd_width; + auto ptr_y = y_data + ni * C * simd_width + ci * simd_width; + auto ptr_zref = + zref_data + ni * C * h * w * simd_width + ci * h * w * simd_width; + auto ptr_ztgt = + ztgt_data + ni * C * h * w * simd_width + ci * h * w * simd_width; + + ref(ptr_x, ptr_y, ptr_zref, h, w); + tgt(ptr_x, ptr_y, ptr_ztgt, h, w); + + if (jitcode) { + auto ptr_zjit = + zjit_data + ni * C * h * w * simd_width + ci * h * w * simd_width; + jitcode(ptr_x, ptr_y, ptr_zjit, h, w); + } + } + } + ExpectEQ(ztgt_data, zref_data, sz); + if (jitcode) { + ExpectEQ(zjit_data, zref_data, sz); + } +} + +template +void TestKernelLayerNorm() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); + const T epsilon = 9.99999975e-06; + for (int n : {1, 2, 10}) { + for (int x_dim_0 : {1, 9, 17, 50}) { + int left = n * x_dim_0; + for (int x_dim_1 : TestSizes()) { + int right = x_dim_1; + auto ref = jit::GetReferFunc(); + EXPECT_TRUE(ref != nullptr); + int sz = left * right; + std::vector x(sz), mean(left), var(left), scale(right), bias(right), + outref(sz); + RandomVec(sz, x.data()); + RandomVec(left, mean.data()); + RandomVec(left, var.data()); + RandomVec(right, scale.data()); + RandomVec(right, bias.data()); + + const T* scale_data = scale.data(); + 
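`TestKernelNCHW16CMulNC` keeps a special-cased body because it checks jitcode availability directly: the kernel broadcasts a per-(n, channel-block) scale vector over NCHW data laid out in SIMD-width channel blocks (`ZMM_FLOAT_BLOCK`, i.e. 16 floats under AVX-512), and generated code is only expected on machines reporting `avx512f`. The indexing it exercises, in plain form:

```cpp
#include <cassert>
#include <vector>

// Toy NCHW[block]C-style multiply: each channel block of width `block`
// shares one scale vector per (n, C). Simplified from the loops above.
void nchw_blocked_mul(const float* x, const float* y, float* z, int n, int C,
                      int h, int w, int block) {
  for (int ni = 0; ni < n; ++ni)
    for (int ci = 0; ci < C; ++ci)
      for (int p = 0; p < h * w; ++p)
        for (int b = 0; b < block; ++b) {
          int idx = ((ni * C + ci) * h * w + p) * block + b;
          z[idx] = x[idx] * y[(ni * C + ci) * block + b];
        }
}

int main() {
  const int n = 1, C = 2, h = 2, w = 2, block = 4;
  std::vector<float> x(n * C * h * w * block, 2.f), y(n * C * block, 3.f),
      z(x.size());
  nchw_blocked_mul(x.data(), y.data(), z.data(), n, C, h, w, block);
  assert(z[0] == 6.f);
}
```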
const T* bias_data = bias.data(); + T* x_data = x.data(); + T* mean_data = mean.data(); + T* var_data = var.data(); + T* outref_data = outref.data(); + + ref(x_data, outref_data, mean_data, var_data, scale_data, bias_data, + left, epsilon, right); + + auto verifier = []( + const typename KernelTuple::func_type tgt, const std::vector& x_, + const std::vector& outref_, const std::vector& mean_, + const std::vector& var_, const std::vector& scale, + const std::vector& bias, const int& left, const float& epsilon, + const typename KernelTuple::attr_type& right) { + EXPECT_TRUE(tgt != nullptr); + std::vector outtgt(outref_.size()); + std::vector x(x_.size()); + std::vector mean(mean_.size()); + std::vector var(var_.size()); + std::vector outref(outref_.size()); + std::copy(x_.begin(), x_.end(), x.begin()); + std::copy(mean_.begin(), mean_.end(), mean.begin()); + std::copy(var_.begin(), var_.end(), var.begin()); + std::copy(outref_.begin(), outref_.end(), outref.begin()); + + EXPECT_EQ(x.size(), static_cast(left * right)); + EXPECT_EQ(outref.size(), static_cast(left * right)); + EXPECT_EQ(mean.size(), static_cast(left)); + EXPECT_EQ(var.size(), static_cast(left)); + EXPECT_EQ(scale.size(), static_cast(right)); + EXPECT_EQ(bias.size(), static_cast(right)); + + const T* scale_data = scale.data(); + const T* bias_data = bias.data(); + T* x_data = x.data(); + T* mean_data = mean.data(); + T* var_data = var.data(); + T* outref_data = outref.data(); + T* outtgt_data = outtgt.data(); + tgt(x_data, outtgt_data, mean_data, var_data, scale_data, bias_data, + left, epsilon, right); + ExpectEQ(outtgt_data, outref_data, left * right); + }; + TestAllImpls(right, verifier, x, outref, mean, + var, scale, bias, left, epsilon, + right); + } + } + } +} + +template +void TestKernelCRFDecoding() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); + constexpr int state_trans_base_idx = 2; + auto test_sizes = TestSizes(); + test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 2000)); + for (int seq_len : {1, 11, 17, 50}) { + for (int tag_num : test_sizes) { + auto ref = jit::GetReferFunc(); + EXPECT_TRUE(ref != nullptr); + int x_sz = seq_len * tag_num; + int w_sz = (tag_num + state_trans_base_idx) * tag_num; + std::vector x(x_sz), w(w_sz), alpharef(x_sz); + std::vector trackref(x_sz); + RandomVec(x_sz, x.data()); + RandomVec(w_sz, w.data()); + + ref(seq_len, (const T*)x.data(), (const T*)w.data(), alpharef.data(), + trackref.data(), tag_num); + + auto verifier = []( + const typename KernelTuple::func_type tgt, const int& seq_len, + const std::vector& x, const std::vector& w, + const std::vector& alpharef, const std::vector& trackref, + const typename KernelTuple::attr_type& tag_num) { + constexpr int state_trans_base_idx = 2; + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(x.size(), static_cast(seq_len * tag_num)); + EXPECT_EQ(w.size(), static_cast( + (tag_num + state_trans_base_idx) * tag_num)); + EXPECT_EQ(alpharef.size(), static_cast(seq_len * tag_num)); + EXPECT_EQ(trackref.size(), static_cast(seq_len * tag_num)); + std::vector alphatgt(alpharef.size()); + std::vector tracktgt(trackref.size()); + memcpy(tracktgt.data(), trackref.data(), tag_num * sizeof(int)); + tgt(seq_len, (const T*)x.data(), (const T*)w.data(), alphatgt.data(), + tracktgt.data(), tag_num); + ExpectEQ(alpharef.data(), alphatgt.data(), seq_len * tag_num); + ExpectEQ(trackref.data(), tracktgt.data(), seq_len * tag_num); + }; + TestAllImpls(tag_num, verifier, seq_len, 
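Beyond restructuring, the new CRFDecoding verifier above fixes a latent bug from the deleted version: the old code called `memcpy(trackref.data(), tracktgt.data(), ...)`, copying from the still-uninitialized target buffer into the reference. `memcpy` takes the destination first:

```cpp
#include <cassert>
#include <cstring>

int main() {
  int ref[4] = {1, 2, 3, 4};  // first row of the reference track
  int tgt[4];                 // target buffer to seed before the kernel runs
  std::memcpy(tgt, ref, sizeof(ref));  // memcpy(dest, src, bytes)
  assert(tgt[2] == 3);
}
```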
x, w, + alpharef, trackref, tag_num); + } + } +} + +template +void TestKernelSeqPool() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); std::vector pool_types = { jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt}; auto test_sizes = TestSizes(); @@ -668,7 +589,7 @@ void TestKernelSeqPoolTuples() { jit::seq_pool_attr_t attr(w, type); for (int h : test_sizes) { attr.h = h; - auto ref = jit::GetRefer>(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector x(h * w), yref(w); RandomVec(h * w, x.data()); @@ -676,16 +597,86 @@ void TestKernelSeqPoolTuples() { T* yref_data = yref.data(); ref(x_data, yref_data, &attr); VLOG(10) << attr; - TestAllImpls, PlaceType, std::vector, - std::vector>(attr, x, yref, attr); + auto verifier = [](const typename KernelTuple::func_type tgt, + const std::vector& x, const std::vector& yref, + const typename KernelTuple::attr_type& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(x.size() % yref.size(), static_cast(0)); + int w = yref.size(); + std::vector y(w); + const T* x_data = x.data(); + const T* yref_data = yref.data(); + T* y_data = y.data(); + tgt(x_data, y_data, &attr); + ExpectEQ(y_data, yref_data, w); + }; + TestAllImpls(attr, verifier, x, yref, attr); } } } } -template -void TestKernelMatMulTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); +template +void TestKernelEmbSeqPool() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); + int64_t tbl_h = 1e4; + std::vector pool_types = { + jit::SeqPoolType::kSum}; // only support sum yet + auto test_sizes = TestSizes(); + test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000)); + for (int tbl_w : test_sizes) { + std::vector table(tbl_h * tbl_w); + RandomVec(tbl_h * tbl_w, table.data()); + const T* table_data = table.data(); + for (auto type : pool_types) { + for (int idx_w : {1, 2, 10, 16}) { + for (int idx_h : {1, 2, 9, 13, 16}) { + auto ref = jit::GetReferFunc(); + EXPECT_TRUE(ref != nullptr); + std::vector idx(idx_h * idx_w); + RandomVec(idx_h * idx_w, idx.data(), 0, tbl_h - 1); + int64_t out_w = tbl_w * idx_w; + std::vector oref(out_w); + const int64_t* idx_data = idx.data(); + T* o_data = oref.data(); + jit::emb_seq_pool_attr_t attr(tbl_h, tbl_w, idx_h, idx_w, out_w, + type); + ref(table_data, idx_data, o_data, &attr); + + auto verifier = [](const typename KernelTuple::func_type tgt, + const std::vector& table, + const std::vector& idx, + const std::vector& oref, + const typename KernelTuple::attr_type& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(table.size(), static_cast(attr.table_height * + attr.table_width)); + EXPECT_EQ(idx.size(), static_cast(attr.index_height * + attr.index_width)); + EXPECT_EQ(oref.size(), + static_cast(attr.table_width * attr.index_width)); + const T* table_data = table.data(); + const int64_t* idx_data = idx.data(); + const T* oref_data = oref.data(); + int o_w = oref.size(); + std::vector out(o_w); + T* o_data = out.data(); + tgt(table_data, idx_data, o_data, &attr); + ExpectEQ(o_data, oref_data, o_w); + }; + TestAllImpls(attr, verifier, table, idx, oref, + attr); + } + } + } + } +} + +template +void TestKernelMatMul() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); auto last_acc = FLAGS_acc; // export MKL_CBWR=AVX would make MKL force to use AVX // export 
KMP_DETERMINISTIC_REDUCTION=yes would make the result deterministic @@ -693,7 +684,7 @@ void TestKernelMatMulTuples() { for (int m : {1, 2, 3, 4}) { for (int n : {1, 2, 3, 4}) { for (int k : TestSizes()) { - auto ref = jit::GetRefer>(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector a(m * k), b(k * n), c(m * n); RandomVec(m * k, a.data()); @@ -703,20 +694,36 @@ void TestKernelMatMulTuples() { T* c_data = c.data(); const jit::matmul_attr_t attr{m, n, k}; ref(a_data, b_data, c_data, &attr); - TestAllImpls, PlaceType, std::vector, - std::vector, std::vector>(attr, a, b, c, attr); + auto verifier = [](const typename KernelTuple::func_type tgt, + const std::vector& a, const std::vector& b, + const std::vector& cref, + const typename KernelTuple::attr_type& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(a.size(), static_cast(attr.m * attr.k)); + EXPECT_EQ(b.size(), static_cast(attr.k * attr.n)); + EXPECT_EQ(cref.size(), static_cast(attr.m * attr.n)); + std::vector c(cref.size()); + const T* a_data = a.data(); + const T* b_data = b.data(); + const T* cref_data = cref.data(); + T* c_data = c.data(); + tgt(a_data, b_data, c_data, &attr); + ExpectEQ(c_data, cref_data, attr.m * attr.n); + }; + TestAllImpls(attr, verifier, a, b, c, attr); } } } FLAGS_acc = last_acc; } -template -void TestKernelSoftmaxTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); +template +void TestKernelSoftmax() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); for (int bs : {1, 2, 10}) { for (int n : TestSizes()) { - auto ref = jit::GetRefer>(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector x(bs * n), y(bs * n); RandomVec(bs * n, x.data()); @@ -730,51 +737,33 @@ void TestKernelSoftmaxTuples() { ref(xinp_data, xinp_data, n, bs); ExpectEQ(xinp_data, y_data, n * bs); - TestAllImpls, PlaceType, std::vector, - std::vector>(n, x, y, n, bs); - } - } -} - -template -void TestKernelEmbSeqPoolTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); - int64_t tbl_h = 1e4; - std::vector pool_types = { - jit::SeqPoolType::kSum}; // only support sum yet - auto test_sizes = TestSizes(); - test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000)); - for (int tbl_w : test_sizes) { - std::vector table(tbl_h * tbl_w); - RandomVec(tbl_h * tbl_w, table.data()); - const T* table_data = table.data(); - for (auto type : pool_types) { - for (int idx_w : {1, 2, 10, 16}) { - for (int idx_h : {1, 2, 9, 13, 16}) { - auto ref = jit::GetRefer>(); - EXPECT_TRUE(ref != nullptr); - std::vector idx(idx_h * idx_w); - RandomVec(idx_h * idx_w, idx.data(), 0, tbl_h - 1); - int64_t out_w = tbl_w * idx_w; - std::vector oref(out_w); - const int64_t* idx_data = idx.data(); - T* o_data = oref.data(); - jit::emb_seq_pool_attr_t attr(tbl_h, tbl_w, idx_h, idx_w, out_w, - type); - ref(table_data, idx_data, o_data, &attr); - - TestAllImpls, PlaceType, std::vector, - std::vector, std::vector>(attr, table, idx, - oref, attr); - } - } + auto verifier = [](const typename KernelTuple::func_type tgt, + const std::vector& x, const std::vector& yref, + int n, int bs) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(yref.size(), x.size()); + EXPECT_EQ(x.size(), static_cast(n * bs)); + const T* x_data = x.data(); + const T* yref_data = yref.data(); + std::vector ytgt(n * bs); + T* ytgt_data = ytgt.data(); + // test normal + tgt(x_data, ytgt_data, n, bs); + ExpectEQ(ytgt_data, yref_data, n * bs); + // 
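The MatMul test keeps its comment about `MKL_CBWR=AVX` and `KMP_DETERMINISTIC_REDUCTION=yes` because MKL may pick different code paths, and therefore different floating-point rounding, from run to run; the test compensates by temporarily loosening the global `FLAGS_acc` tolerance and restoring it afterwards. That manual save/restore pair could also be written as a small RAII guard; a sketch only, not something this patch introduces:

```cpp
float FLAGS_acc = 1e-5f;  // stand-in for the real gflags tolerance

template <typename T>
class ScopedOverride {
 public:
  ScopedOverride(T* slot, T value) : slot_(slot), saved_(*slot) { *slot = value; }
  ~ScopedOverride() { *slot_ = saved_; }  // restored even on early return

 private:
  T* slot_;
  T saved_;
};

int main() {
  ScopedOverride<float> acc_guard(&FLAGS_acc, 1e-3f);
  // ... tolerance-sensitive MatMul checks would run here ...
}
```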
test inplace x + std::copy(x.begin(), x.end(), ytgt.begin()); + tgt(ytgt_data, ytgt_data, n, bs); + ExpectEQ(ytgt_data, yref_data, n * bs); + }; + TestAllImpls(n, verifier, x, y, n, bs); } } } -template -void TestKernelSgdTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); +template +void TestKernelSgd() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); const T lr = 0.1; auto UnDuplicatedRandomVec = [](int n, const int64_t lower, const int64_t upper) -> std::vector { @@ -802,7 +791,7 @@ void TestKernelSgdTuples() { RandomVec(rows_size * grad_w, grad.data()); const int64_t* rows_data = rows.data(); const T* grad_data = grad.data(); - auto ref = jit::GetRefer>(); + auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); jit::sgd_attr_t attr(param_h, grad_w, rows_size, grad_w, rows_size); ref(&lr, param_data, grad_data, rows_data, out_data, &attr); @@ -818,227 +807,488 @@ void TestKernelSgdTuples() { grad_w); } - TestAllImpls, PlaceType, T, std::vector, - std::vector, std::vector, std::vector>( - attr, lr, param, grad, rows, param_out, attr); + auto verifier = []( + const typename KernelTuple::func_type tgt, const T lr, + const std::vector& param, const std::vector& grad, + const std::vector& rows, const std::vector& oref, + const typename KernelTuple::attr_type& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(param.size(), + static_cast(attr.param_height * attr.param_width)); + EXPECT_EQ(grad.size(), + static_cast(attr.grad_height * attr.grad_width)); + EXPECT_EQ(rows.size(), static_cast(attr.selected_rows_size)); + EXPECT_EQ(param.size(), oref.size()); + const T* param_data = param.data(); + const T* grad_data = grad.data(); + const int64_t* rows_data = rows.data(); + const T* oref_data = oref.data(); + + std::vector out(oref.size()); + T* o_data = out.data(); + tgt(&lr, param_data, grad_data, rows_data, o_data, &attr); + // only the selected rows should be equal + for (size_t i = 0; i < rows.size(); ++i) { + ExpectEQ(o_data + rows[i] * attr.grad_width, + oref_data + rows[i] * attr.grad_width, attr.grad_width); + } + + // inplace + std::copy(param.begin(), param.end(), out.begin()); + tgt(&lr, o_data, grad_data, rows_data, o_data, &attr); + for (size_t i = 0; i < rows.size(); ++i) { + ExpectEQ(o_data + rows[i] * attr.grad_width, + oref_data + rows[i] * attr.grad_width, attr.grad_width); + } + }; + TestAllImpls(attr, verifier, lr, param, grad, + rows, param_out, attr); } } } } -template -void TestKernelNCHW16CMulNCTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); - const int n = 3, c = 16 * 4, h = 10, w = 10; - auto ref = jit::GetRefer>(); - EXPECT_TRUE(ref != nullptr); - int sz = n * c * h * w; - std::vector x(sz), y(n * c), zref(sz); - std::vector ztgt(sz), zjit(sz); - RandomVec(sz, x.data()); - RandomVec(n * c, y.data()); - - const T* x_data = x.data(); - const T* y_data = y.data(); - T* zref_data = zref.data(); - T* ztgt_data = ztgt.data(); - T* zjit_data = zjit.data(); - constexpr int simd_width = ZMM_FLOAT_BLOCK; - int C = c / simd_width; - auto tgt = jit::Get, PlaceType>(0); - auto jitcode = jit::GetJitCode, PlaceType>(0); - EXPECT_TRUE(tgt != nullptr); - - if (std::is_same::value && - paddle::platform::MayIUse(paddle::platform::avx512f)) { - EXPECT_TRUE(jitcode != nullptr); - } - for (int ni = 0; ni < n; ni++) { - for (int ci = 0; ci < C; ci++) { - auto ptr_x = - x_data + ni * C * h * w * simd_width + ci * h * w * simd_width; - auto ptr_y = y_data + ni * C * 
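The SGD verifier above compares only the selected rows on purpose: this kernel implements sparse SGD over a selected-rows gradient, so rows absent from `rows` are never written (and, on the non-inplace path, are not defined). The reference semantics in miniature:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Toy sparse SGD: param_out starts as a copy of param; only listed rows move.
void sgd(float lr, const float* grad, const int64_t* rows, int64_t n_rows,
         int64_t width, float* param_out) {
  for (int64_t i = 0; i < n_rows; ++i)
    for (int64_t j = 0; j < width; ++j)
      param_out[rows[i] * width + j] -= lr * grad[i * width + j];
}

int main() {
  std::vector<float> param(4 * 2, 1.f);       // 4 rows, width 2
  std::vector<float> grad{0.5f, 0.5f};        // gradient for one row
  std::vector<int64_t> rows{2};               // only row 2 is selected
  sgd(0.1f, grad.data(), rows.data(), 1, 2, param.data());
  assert(param[2 * 2] == 1.f - 0.1f * 0.5f);  // row 2 updated
  assert(param[0] == 1.f);                    // row 0 untouched
}
```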
simd_width + ci * simd_width; - auto ptr_zref = - zref_data + ni * C * h * w * simd_width + ci * h * w * simd_width; - auto ptr_ztgt = - ztgt_data + ni * C * h * w * simd_width + ci * h * w * simd_width; - - ref(ptr_x, ptr_y, ptr_zref, h, w); - tgt(ptr_x, ptr_y, ptr_ztgt, h, w); +template +void TestKernelVBroadcast() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); + for (int w : TestSizes()) { + std::vector x(w); + RandomVec(w, x.data()); + const T* x_data = x.data(); + for (int64_t h : {1, 2, 6}) { + auto ref = jit::GetReferFunc(); + EXPECT_TRUE(ref != nullptr); + std::vector y(w * h); + T* y_data = y.data(); + ref(x_data, y_data, h, w); - if (jitcode) { - auto ptr_zjit = - zjit_data + ni * C * h * w * simd_width + ci * h * w * simd_width; - jitcode(ptr_x, ptr_y, ptr_zjit, h, w); - } + auto verifier = [](const typename KernelTuple::func_type tgt, + const std::vector& x, const std::vector& yref, + const int64_t& h, + const typename KernelTuple::attr_type& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(x.size(), static_cast(attr)); + EXPECT_EQ(yref.size(), x.size() * h); + std::vector y(yref.size()); + const T* x_data = x.data(); + const T* yref_data = yref.data(); + T* y_data = y.data(); + tgt(x_data, y_data, h, attr); + ExpectEQ(y_data, yref_data, yref.size()); + }; + TestAllImpls(static_cast(w), verifier, x, + y, h, static_cast(w)); } } - ExpectEQ(ztgt_data, zref_data, sz); - if (jitcode) { - ExpectEQ(zjit_data, zref_data, sz); - } } -template -void TestKernelLayerNormTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); - const T epsilon = 9.99999975e-06; - for (int n : {1, 2, 10}) { - for (int x_dim_0 : {1, 9, 17, 50}) { - int left = n * x_dim_0; - for (int x_dim_1 : TestSizes()) { - int right = x_dim_1; - auto ref = jit::GetRefer>(); - EXPECT_TRUE(ref != nullptr); - int sz = left * right; - std::vector x(sz), mean(left), var(left), scale(right), bias(right), - outref(sz); - RandomVec(sz, x.data()); - RandomVec(left, mean.data()); - RandomVec(left, var.data()); - RandomVec(right, scale.data()); - RandomVec(right, bias.data()); - - const T* scale_data = scale.data(); - const T* bias_data = bias.data(); - T* x_data = x.data(); - T* mean_data = mean.data(); - T* var_data = var.data(); - T* outref_data = outref.data(); +// test pool +TEST(JITKernel_pool, jitcreator) { + const auto& jitcreators = jit::JitCodeCreatorPool::Instance().AllCreators(); +#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) + EXPECT_EQ(jitcreators.size(), 0UL); +#else + EXPECT_EQ(jitcreators.size(), 25UL); +#endif +} - ref(x_data, outref_data, mean_data, var_data, scale_data, bias_data, - left, epsilon, right); +TEST(JITKernel_pool, jitpool) { + // jitpool is related with attr + const auto& kers = jit::JitCodePool().Instance().AllKernels(); + EXPECT_EQ(kers.size(), 0UL); + jit::GetAllCandidateKernels, CPUPlace>(3); +// after call GetAllCandidateKernels, it will create jitcode Automatically +#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) + EXPECT_EQ(kers.size(), 0UL); +#else + EXPECT_EQ(kers.size(), 1UL); +#endif +} - TestAllImpls, PlaceType, std::vector, - std::vector, std::vector, std::vector, - std::vector, std::vector, int, float>( - right, x, outref, mean, var, scale, bias, left, epsilon, right); - } - } - } +TEST(JITKernel_pool, more) { + const auto& kers = jit::KernelPool::Instance().AllKernels(); +#if defined(__APPLE__) || defined(__OSX__) + EXPECT_EQ(kers.size(), 10UL); +#else +#ifdef 
PADDLE_WITH_MKLML + EXPECT_EQ(kers.size(), 21UL); +#else + EXPECT_EQ(kers.size(), 8UL); +#endif +#endif } -template -void TestKernelCRFDecodingTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); - constexpr int state_trans_base_idx = 2; - auto test_sizes = TestSizes(); - test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 2000)); - for (int seq_len : {1, 11, 17, 50}) { - for (int tag_num : test_sizes) { - auto ref = jit::GetRefer>(); - EXPECT_TRUE(ref != nullptr); - int x_sz = seq_len * tag_num; - int w_sz = (tag_num + state_trans_base_idx) * tag_num; - std::vector x(x_sz), w(w_sz), alpharef(x_sz); - std::vector trackref(x_sz); - RandomVec(x_sz, x.data()); - RandomVec(w_sz, w.data()); +TEST(JITKernel_pool, refer) { + const auto& kers = jit::ReferKernelPool::Instance().AllKernels(); + EXPECT_EQ(kers.size(), 29UL); +} - ref(seq_len, (const T*)x.data(), (const T*)w.data(), alpharef.data(), - trackref.data(), tag_num); +// test helper +TEST(JITKernel_helper, GetAllCandidateKernels) { + auto fp_kers = + jit::GetAllCandidateKernels, CPUPlace>(10); +#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) + EXPECT_GE(fp_kers.size(), 1UL); // refer +#else +#ifdef PADDLE_WITH_MKLML + EXPECT_GE(fp_kers.size(), 3UL); // jitcode, mkl, refer +#else + EXPECT_GE(fp_kers.size(), 2UL); // jitcode, refer +#endif +#endif + + auto db_kers = + jit::GetAllCandidateKernels, CPUPlace>(10); +#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) + EXPECT_GE(db_kers.size(), 1UL); // refer +#else +#ifdef PADDLE_WITH_MKLML + EXPECT_GE(db_kers.size(), 2UL); // mkl, refer +#else + EXPECT_GE(db_kers.size(), 1UL); // refer +#endif +#endif +} - TestAllImpls, PlaceType, int, - std::vector, std::vector, std::vector, - std::vector, int>(tag_num, seq_len, x, w, alpharef, - trackref, tag_num); - } - } +TEST(JITKernel_helper, GetAllCandidateFuncsWithTypes) { + auto fp_kers = + jit::GetAllCandidateFuncsWithTypes, CPUPlace>(10); +#if defined(__APPLE__) || defined(__OSX__) + EXPECT_GE(fp_kers.size(), 1UL); // refer +#else +#if !defined(PADDLE_WITH_MKLML) || defined(_WIN32) + EXPECT_GE(fp_kers.size(), 2UL); // jitcode/mkl, refer +#else + EXPECT_GE(fp_kers.size(), 3UL); // jitcode, mkl, refer +#endif +#endif + + auto db_kers = + jit::GetAllCandidateFuncsWithTypes, CPUPlace>(10); +#if defined(__APPLE__) || defined(__OSX__) || !defined(PADDLE_WITH_MKLML) + EXPECT_GE(db_kers.size(), 1UL); // refer +#else + EXPECT_GE(db_kers.size(), 2UL); // mkl, refer +#endif } -template -void TestKernelVBroadcastTuples() { - VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); - for (int w : TestSizes()) { - std::vector x(w); - RandomVec(w, x.data()); - const T* x_data = x.data(); - for (int64_t h : {1, 2, 6}) { - auto ref = jit::GetRefer>(); - EXPECT_TRUE(ref != nullptr); - std::vector y(w * h); - T* y_data = y.data(); - ref(x_data, y_data, h, w); +TEST(JITKernel_helper, KernelFuncs) { + auto f1 = jit::KernelFuncs, CPUPlace>::Cache().At(3); + auto f2 = jit::KernelFuncs, CPUPlace>::Cache()[3]; + EXPECT_TRUE(f1 != nullptr); + EXPECT_TRUE(f1 == f2); + + auto f3 = jit::KernelFuncs, CPUPlace>::Cache()[5]; +#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) + EXPECT_TRUE(f2 == f3); +#else + EXPECT_TRUE(f2 != f3); +#endif +} - TestAllImpls, PlaceType, std::vector, - std::vector, int64_t>(static_cast(w), x, y, h, - static_cast(w)); - } +TEST(JITKernel_helper, GetAllCandidateFuncs) { + auto funcs = jit::GetAllCandidateFuncs, CPUPlace>(10); + auto kers = jit::GetAllCandidateKernels, CPUPlace>(10); 
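The helper tests above pin down two contracts: the candidate ordering (generated jitcode first where the platform supports it, then "more" backends such as MKL gated by `CanBeUsed`, then the refer fallback, hence the platform-dependent counts) and, in `GetAllCandidateFuncs`, that every candidate agrees numerically with `GetDefaultBestFunc`. A toy dispatcher with that priority rule:

```cpp
#include <functional>
#include <iostream>
#include <string>
#include <vector>

using Fn = std::function<void()>;

struct Candidate {
  std::string kind;  // "jitcode", "mkl", "refer", ...
  bool usable;       // CanBeUsed(attr) for this attribute
  Fn fn;
};

// First usable candidate wins; refer is always last and always usable.
Fn BestFunc(const std::vector<Candidate>& ordered) {
  for (const auto& c : ordered)
    if (c.usable) {
      std::cout << "picked " << c.kind << "\n";
      return c.fn;
    }
  return {};
}

int main() {
  std::vector<Candidate> c = {{"jitcode", false, [] {}},
                              {"mkl", true, [] {}},
                              {"refer", true, [] {}}};
  BestFunc(c)();  // prints: picked mkl
}
```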
+ EXPECT_EQ(funcs.size(), kers.size()); + + std::vector x(10), tgt(10); + RandomVec(10, x.data()); + auto best = jit::GetDefaultBestFunc, CPUPlace>(10); + best(x.data(), tgt.data(), 10); + for (auto f : funcs) { + std::vector y(10); + f(x.data(), y.data(), 10); + ExpectEQ(y.data(), tgt.data(), 10); } } -#define TEST_CPU_KERNEL(test_tuple, kernel_type) \ - TEST(JITKernel, kernel_type) { \ - TestKernel##test_tuple(); \ - TestKernel##test_tuple(); \ +TEST(JITKernel_helper, pack_weights) { + const int N = 8 * 60, K = 2; + float src[K][N], yref[K][N], y[K * N]; + float* x = &(src[0][0]); + float* ref = &(yref[0][0]); + for (int i = 0; i < N * K; ++i) { + *(x + i) = static_cast(i); + } + int block = 0; + std::vector groups; + if (paddle::platform::MayIUse(paddle::platform::avx512f)) { + block = ZMM_FLOAT_BLOCK; + groups.push_back(30); + } else { + block = YMM_FLOAT_BLOCK; + groups.insert(groups.end(), {14, 14, 14, 14, 4}); } -TEST_CPU_KERNEL(XYZNTuples, kVMul); -TEST_CPU_KERNEL(XYZNTuples, kVAdd); -TEST_CPU_KERNEL(XYZNTuples, kVAddRelu); -TEST_CPU_KERNEL(XYZNTuples, kVSub); - -TEST_CPU_KERNEL(AXYNTuples, kVScal); -TEST_CPU_KERNEL(AXYNTuples, kVAddBias); + int offset = 0; + int acc = 0; + for (int g : groups) { + g = g * block; + for (int k = 0; k < K; ++k) { + for (int i = 0; i < g; ++i) { + *(ref + offset) = src[k][i + acc]; + offset++; + } + } + acc += g; + } -TEST_CPU_KERNEL(XRNTuples, kHMax); -TEST_CPU_KERNEL(XRNTuples, kHSum); + jit::pack_weights(x, y, N, K); + ExpectEQ(y, ref, N * K); +} -TEST_CPU_KERNEL(XYNTuples, kVRelu); -TEST_CPU_KERNEL(XYNTuples, kVIdentity); -TEST_CPU_KERNEL(XYNTuples, kVSquare); -TEST_CPU_KERNEL(XYNTuples, kVExp); -TEST_CPU_KERNEL(XYNTuples, kVSigmoid); -TEST_CPU_KERNEL(XYNTuples, kVTanh); -TEST_CPU_KERNEL(XYNTuples, kVCopy); +TEST(JITKernel_helper, attr) { + std::ostringstream out; + // KernelTypes + out << jit::to_string(jit::kNone) << jit::to_string(jit::kCRFDecoding) + << jit::to_string(jit::kEmbSeqPool) << jit::to_string(jit::kGRUH1) + << jit::to_string(jit::kGRUHtPart1) << jit::to_string(jit::kGRUHtPart2) + << jit::to_string(jit::kHSum) << jit::to_string(jit::kHMax) + << jit::to_string(jit::kLSTMCtHt) << jit::to_string(jit::kLSTMC1H1) + << jit::to_string(jit::kLayerNorm) << jit::to_string(jit::kMatMul) + << jit::to_string(jit::kNCHW16CMulNC) << jit::to_string(jit::kSeqPool) + << jit::to_string(jit::kSoftmax) << jit::to_string(jit::kVAdd) + << jit::to_string(jit::kVAddBias) << jit::to_string(jit::kVAddRelu) + << jit::to_string(jit::kVBroadcast) << jit::to_string(jit::kVCopy) + << jit::to_string(jit::kVExp) << jit::to_string(jit::kVIdentity) + << jit::to_string(jit::kVMul) << jit::to_string(jit::kVRelu) + << jit::to_string(jit::kVScal) << jit::to_string(jit::kSgd) + << jit::to_string(jit::kVSigmoid) << jit::to_string(jit::kVSquare) + << jit::to_string(jit::kVSub) << jit::to_string(jit::kVTanh); + EXPECT_EQ(out.str().size(), 234); + + // SeqPoolTypes + out.str(""); + out << jit::to_string(jit::kSum) << jit::to_string(jit::kAvg) + << jit::to_string(jit::kSqrt); + EXPECT_EQ(out.str().size(), 13); + + EXPECT_EQ(jit::to_kerneltype("relu"), jit::kVRelu); + EXPECT_EQ(jit::to_kerneltype("Identity"), jit::kVIdentity); + EXPECT_EQ(jit::to_kerneltype("VEXP"), jit::kVExp); + EXPECT_EQ(jit::to_kerneltype("SigmoiD"), jit::kVSigmoid); + EXPECT_EQ(jit::to_kerneltype("VTanh"), jit::kVTanh); + + out.str(""); + out << jit::lstm_attr_t(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); + EXPECT_EQ(out.str().size(), 89); + + out.str(""); + out << jit::gru_attr_t(8, jit::kVIdentity, 
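The `pack_weights` test above builds its expected layout by hand: columns are cut into SIMD-block groups (one group of 30 blocks under AVX-512, `14, 14, 14, 14, 4` under AVX2, each scaled by the block width), and within each group the K rows are laid out back to back. The expected-value construction, lifted into a standalone routine for clarity:

```cpp
#include <cassert>
#include <vector>

// Mirror of the test's reference packing: for each column group, emit that
// group's slice of every row contiguously. group_cols are widths in elements.
std::vector<float> pack(const std::vector<float>& src, int K, int N,
                        const std::vector<int>& group_cols) {
  std::vector<float> out;
  out.reserve(src.size());
  int acc = 0;  // starting column of the current group
  for (int g : group_cols) {
    for (int k = 0; k < K; ++k)
      for (int i = 0; i < g; ++i) out.push_back(src[k * N + acc + i]);
    acc += g;
  }
  return out;
}

int main() {
  // K=2 rows, N=4 columns, two groups of 2 columns each.
  std::vector<float> src{0, 1, 2, 3, 4, 5, 6, 7};
  auto out = pack(src, 2, 4, {2, 2});
  // Group 0 holds columns {0,1} of row 0, then of row 1, and so on.
  assert((out == std::vector<float>{0, 1, 4, 5, 2, 3, 6, 7}));
}
```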
jit::kVSigmoid); + EXPECT_EQ(out.str().size(), 52); + + out.str(""); + out << jit::seq_pool_attr_t(8, jit::SeqPoolType::kSum); + EXPECT_EQ(out.str().size(), 44); + + out.str(""); + out << jit::emb_seq_pool_attr_t(1, 2, 3, 4, 5, jit::SeqPoolType::kAvg); + EXPECT_EQ(out.str().size(), 93); + + out.str(""); + out << jit::sgd_attr_t(1, 2, 3, 4, 5); + EXPECT_EQ(out.str().size(), 81); + + out.str(""); + out << jit::matmul_attr_t(1, 2, 3); + EXPECT_EQ(out.str().size(), 14); +} -TEST_CPU_KERNEL(LSTMTuples, kLSTMCtHt); -TEST_CPU_KERNEL(LSTMTuples, kLSTMC1H1); +// test keys +TEST(JITKernel_key, int) { + EXPECT_TRUE(jit::JitCodeKey(2) == jit::JitCodeKey(2)); + EXPECT_TRUE(jit::JitCodeKey(2) == jit::JitCodeKey(2)); + EXPECT_TRUE(jit::JitCodeKey(2) != jit::JitCodeKey(3)); +} -TEST_CPU_KERNEL(GRUTuples, kGRUH1); -TEST_CPU_KERNEL(GRUTuples, kGRUHtPart1); -TEST_CPU_KERNEL(GRUTuples, kGRUHtPart2); +TEST(JITKernel_key, gru) { + jit::gru_attr_t attr1(8, jit::kVSigmoid, jit::kVTanh); + jit::gru_attr_t attr2(8, jit::kVSigmoid, jit::kVTanh); + jit::gru_attr_t attr3(9, jit::kVSigmoid, jit::kVTanh); + jit::gru_attr_t attr4(9, jit::kVSigmoid, jit::kVIdentity); + jit::gru_attr_t attr5(9, jit::kVTanh, jit::kVIdentity); -TEST_CPU_KERNEL(NCHW16CMulNCTuples, kNCHW16CMulNC); + auto key1 = jit::JitCodeKey(attr1); + auto key2 = jit::JitCodeKey(attr2); + auto key3 = jit::JitCodeKey(attr3); + auto key4 = jit::JitCodeKey(attr4); + auto key5 = jit::JitCodeKey(attr5); -TEST_CPU_KERNEL(SeqPoolTuples, kSeqPool); -TEST_CPU_KERNEL(MatMulTuples, kMatMul); -TEST_CPU_KERNEL(SoftmaxTuples, kSoftmax); -TEST_CPU_KERNEL(EmbSeqPoolTuples, kEmbSeqPool); -TEST_CPU_KERNEL(SgdTuples, kSgd); -TEST_CPU_KERNEL(LayerNormTuples, kLayerNorm); -TEST_CPU_KERNEL(CRFDecodingTuples, kCRFDecoding); -TEST_CPU_KERNEL(VBroadcastTuples, kVBroadcast); + EXPECT_TRUE(key1 == key2); + EXPECT_TRUE(key2 != key3); + EXPECT_TRUE(key2 != key4); + EXPECT_TRUE(key2 != key5); + EXPECT_TRUE(key3 != key4); + EXPECT_TRUE(key3 != key5); + EXPECT_TRUE(key4 != key5); +} TEST(JITKernel_key, lstm) { jit::lstm_attr_t attr1(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); - jit::lstm_attr_t attr2(9, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); + jit::lstm_attr_t attr2(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); jit::lstm_attr_t attr3(9, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); jit::lstm_attr_t attr4(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh); + jit::lstm_attr_t attr5(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh, true); + jit::lstm_attr_t attr6(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh, true); auto key1 = jit::JitCodeKey(attr1); auto key2 = jit::JitCodeKey(attr2); auto key3 = jit::JitCodeKey(attr3); auto key4 = jit::JitCodeKey(attr4); + auto key5 = jit::JitCodeKey(attr5); + auto key6 = jit::JitCodeKey(attr6); - EXPECT_TRUE(key1 != key2); - EXPECT_TRUE(key2 == key3); + EXPECT_TRUE(key1 == key2); + EXPECT_TRUE(key2 != key3); + EXPECT_TRUE(key2 != key4); + EXPECT_TRUE(key2 != key5); EXPECT_TRUE(key3 != key4); + EXPECT_TRUE(key3 != key5); + EXPECT_TRUE(key4 != key5); + EXPECT_TRUE(key5 == key6); } -TEST(JITKernel_key, gru) { - jit::gru_attr_t attr1(8, jit::kVSigmoid, jit::kVTanh); - jit::gru_attr_t attr2(9, jit::kVSigmoid, jit::kVTanh); - jit::gru_attr_t attr3(9, jit::kVSigmoid, jit::kVTanh); - jit::gru_attr_t attr4(9, jit::kVSigmoid, jit::kVIdentity); +TEST(JITKernel_key, seq_pool) { + jit::seq_pool_attr_t attr1(2, jit::SeqPoolType::kSum, 1); + jit::seq_pool_attr_t attr2(2, jit::SeqPoolType::kSum, 3); + jit::seq_pool_attr_t attr3(3, jit::SeqPoolType::kSum, 3); + 
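These key tests document which attribute fields feed `JitCodeKey`: attributes that would generate identical machine code must collide. The two `seq_pool_attr_t`s above that differ only in their last constructor argument, the height (a runtime trip count per the `attr.h = h` loop in `TestKernelSeqPool`), get equal keys, while a different width or pool type does not. The idea in miniature:

```cpp
#include <cassert>
#include <cstdint>

struct seq_pool_attr {
  int h;     // runtime trip count: not part of the generated code
  int w;     // vector width: changes the code
  int type;  // sum / avg / sqrt: changes the code
};

// Toy key: hash only the codegen-relevant fields (real hashing differs).
int64_t JitCodeKeyDemo(const seq_pool_attr& a) {
  return static_cast<int64_t>(a.w) * 8 + a.type;
}

int main() {
  seq_pool_attr a{1, 2, 0}, b{3, 2, 0}, c{3, 3, 0};
  assert(JitCodeKeyDemo(a) == JitCodeKeyDemo(b));  // h differs: same key
  assert(JitCodeKeyDemo(a) != JitCodeKeyDemo(c));  // w differs: new code
}
```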
 TEST(JITKernel_key, lstm) {
   jit::lstm_attr_t attr1(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh);
-  jit::lstm_attr_t attr2(9, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh);
+  jit::lstm_attr_t attr2(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh);
   jit::lstm_attr_t attr3(9, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh);
   jit::lstm_attr_t attr4(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh);
+  jit::lstm_attr_t attr5(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh, true);
+  jit::lstm_attr_t attr6(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh, true);

   auto key1 = jit::JitCodeKey(attr1);
   auto key2 = jit::JitCodeKey(attr2);
   auto key3 = jit::JitCodeKey(attr3);
   auto key4 = jit::JitCodeKey(attr4);
+  auto key5 = jit::JitCodeKey(attr5);
+  auto key6 = jit::JitCodeKey(attr6);

-  EXPECT_TRUE(key1 != key2);
-  EXPECT_TRUE(key2 == key3);
+  EXPECT_TRUE(key1 == key2);
+  EXPECT_TRUE(key2 != key3);
+  EXPECT_TRUE(key2 != key4);
+  EXPECT_TRUE(key2 != key5);
   EXPECT_TRUE(key3 != key4);
+  EXPECT_TRUE(key3 != key5);
+  EXPECT_TRUE(key4 != key5);
+  EXPECT_TRUE(key5 == key6);
 }

-TEST(JITKernel_key, gru) {
-  jit::gru_attr_t attr1(8, jit::kVSigmoid, jit::kVTanh);
-  jit::gru_attr_t attr2(9, jit::kVSigmoid, jit::kVTanh);
-  jit::gru_attr_t attr3(9, jit::kVSigmoid, jit::kVTanh);
-  jit::gru_attr_t attr4(9, jit::kVSigmoid, jit::kVIdentity);
+TEST(JITKernel_key, seq_pool) {
+  jit::seq_pool_attr_t attr1(2, jit::SeqPoolType::kSum, 1);
+  jit::seq_pool_attr_t attr2(2, jit::SeqPoolType::kSum, 3);
+  jit::seq_pool_attr_t attr3(3, jit::SeqPoolType::kSum, 3);
+  jit::seq_pool_attr_t attr4(3, jit::SeqPoolType::kAvg, 3);

-  auto key1 = jit::JitCodeKey(attr1);
-  auto key2 = jit::JitCodeKey(attr2);
-  auto key3 = jit::JitCodeKey(attr3);
-  auto key4 = jit::JitCodeKey(attr4);
+  auto key1 = jit::JitCodeKey(attr1);
+  auto key2 = jit::JitCodeKey(attr2);
+  auto key3 = jit::JitCodeKey(attr3);
+  auto key4 = jit::JitCodeKey(attr4);

-  EXPECT_TRUE(key1 != key2);
+  EXPECT_TRUE(key1 == key2);
+  EXPECT_TRUE(key2 != key3);
+  EXPECT_TRUE(key2 != key4);
+  EXPECT_TRUE(key3 != key4);
+}
+
+TEST(JITKernel_key, matmul) {
+  jit::matmul_attr_t attr1(1, 2, 3);
+  jit::matmul_attr_t attr2(1, 2, 3);
+  jit::matmul_attr_t attr3(1, 3, 3);
+  jit::matmul_attr_t attr4(2, 3, 4);
+
+  auto key1 = jit::JitCodeKey(attr1);
+  auto key2 = jit::JitCodeKey(attr2);
+  auto key3 = jit::JitCodeKey(attr3);
+  auto key4 = jit::JitCodeKey(attr4);
+
+  EXPECT_TRUE(key1 == key2);
+  EXPECT_TRUE(key2 != key3);
+  EXPECT_TRUE(key2 != key4);
+  EXPECT_TRUE(key3 != key4);
+}
+
+TEST(JITKernel_key, emb_seq_pool) {
+  jit::emb_seq_pool_attr_t attr1(1, 2, 3, 4, 5, jit::SeqPoolType::kSum);
+  jit::emb_seq_pool_attr_t attr2(1, 2, 3, 4, 5, jit::SeqPoolType::kSum);
+  jit::emb_seq_pool_attr_t attr3(10, 2, 9, 8, 7, jit::SeqPoolType::kAvg);
+  jit::emb_seq_pool_attr_t attr4(10, 3, 9, 8, 7, jit::SeqPoolType::kSum);
+  jit::emb_seq_pool_attr_t attr5(1, 6, 3, 4, 5, jit::SeqPoolType::kSum);
+
+  auto key1 = jit::JitCodeKey(attr1);
+  auto key2 = jit::JitCodeKey(attr2);
+  auto key3 = jit::JitCodeKey(attr3);
+  auto key4 = jit::JitCodeKey(attr4);
+  auto key5 = jit::JitCodeKey(attr5);
+
+  EXPECT_TRUE(key1 == key2);
+  EXPECT_TRUE(key2 == key3);
+  EXPECT_TRUE(key2 != key4);
+  EXPECT_TRUE(key2 != key5);
+  EXPECT_TRUE(key4 != key5);
+}
+
+TEST(JITKernel_key, sgd) {
+  jit::sgd_attr_t attr1(1, 2, 3, 4, 5);
+  jit::sgd_attr_t attr2(1, 2, 3, 4, 5);
+  jit::sgd_attr_t attr3(9, 8, 7, 4, 6);
+  jit::sgd_attr_t attr4(1, 2, 3, 6, 5);
+  jit::sgd_attr_t attr5(10, 9, 8, 7, 6);
+
+  auto key1 = jit::JitCodeKey(attr1);
+  auto key2 = jit::JitCodeKey(attr2);
+  auto key3 = jit::JitCodeKey(attr3);
+  auto key4 = jit::JitCodeKey(attr4);
+  auto key5 = jit::JitCodeKey(attr5);
+
+  EXPECT_TRUE(key1 == key2);
   EXPECT_TRUE(key2 == key3);
   EXPECT_TRUE(key3 != key4);
+  EXPECT_TRUE(key3 != key5);
+  EXPECT_TRUE(key4 != key5);
 }
-// TODO(TJ): add more test about key and pool
+
+// test kernels
+#define TestKernelVMul TestKernelXYZN
+#define TestKernelVAdd TestKernelXYZN
+#define TestKernelVAddRelu TestKernelXYZN
+#define TestKernelVSub TestKernelXYZN
+
+#define TestKernelVScal TestKernelAXYN
+#define TestKernelVAddBias TestKernelAXYN
+
+#define TestKernelVRelu TestKernelXYN
+#define TestKernelVIdentity TestKernelXYN
+#define TestKernelVSquare TestKernelXYN
+#define TestKernelVExp TestKernelXYN
+#define TestKernelVSigmoid TestKernelXYN
+#define TestKernelVTanh TestKernelXYN
+#define TestKernelVCopy TestKernelXYN
+
+#define TestKernelHMax TestKernelXRN
+#define TestKernelHSum TestKernelXRN
+
+#define TestKernelLSTMCtHt TestKernelLSTM
+#define TestKernelLSTMC1H1 TestKernelLSTM
+
+#define TestKernelGRUH1 TestKernelGRU
+#define TestKernelGRUHtPart1 TestKernelGRU
+#define TestKernelGRUHtPart2 TestKernelGRU
+
+#define TEST_CPU_KERNEL(kernel_type)                                      \
+  TEST(JITKernel, kernel_type) {                                          \
+    TestKernel##kernel_type<jit::kernel_type##Tuple<float>, CPUPlace>();  \
+    TestKernel##kernel_type<jit::kernel_type##Tuple<double>, CPUPlace>(); \
+  }
+
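+// For illustration, TEST_CPU_KERNEL(VMul) should expand roughly to the
+// following (the tuple spelling is an assumption based on the aliases above):
+//   TEST(JITKernel, VMul) {
+//     TestKernelXYZN<jit::VMulTuple<float>, CPUPlace>();
+//     TestKernelXYZN<jit::VMulTuple<double>, CPUPlace>();
+//   }
+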
+TEST_CPU_KERNEL(VMul);
+TEST_CPU_KERNEL(VAdd);
+TEST_CPU_KERNEL(VAddRelu);
+TEST_CPU_KERNEL(VSub);
+
+TEST_CPU_KERNEL(VScal);
+TEST_CPU_KERNEL(VAddBias);
+
+TEST_CPU_KERNEL(VRelu);
+TEST_CPU_KERNEL(VIdentity);
+TEST_CPU_KERNEL(VSquare);
+TEST_CPU_KERNEL(VExp);
+TEST_CPU_KERNEL(VSigmoid);
+TEST_CPU_KERNEL(VTanh);
+TEST_CPU_KERNEL(VCopy);
+
+TEST_CPU_KERNEL(HMax);
+TEST_CPU_KERNEL(HSum);
+
+TEST_CPU_KERNEL(LSTMCtHt);
+TEST_CPU_KERNEL(LSTMC1H1);
+
+TEST_CPU_KERNEL(GRUH1);
+TEST_CPU_KERNEL(GRUHtPart1);
+TEST_CPU_KERNEL(GRUHtPart2);
+
+TEST_CPU_KERNEL(NCHW16CMulNC);
+TEST_CPU_KERNEL(LayerNorm);
+TEST_CPU_KERNEL(CRFDecoding);
+
+TEST_CPU_KERNEL(SeqPool);
+TEST_CPU_KERNEL(EmbSeqPool);
+TEST_CPU_KERNEL(MatMul);
+TEST_CPU_KERNEL(Softmax);
+TEST_CPU_KERNEL(Sgd);
+TEST_CPU_KERNEL(VBroadcast);
diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc
index b9db6daf0825b573bfc7f684266212f998c91627..9b1a854a312551732424e0d127a43328b8db6085 100644
--- a/paddle/fluid/operators/layer_norm_op.cc
+++ b/paddle/fluid/operators/layer_norm_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/layer_norm_op.h"
+#include <memory>

 namespace paddle {
 namespace operators {
@@ -133,7 +134,7 @@ class LayerNormGradOp : public framework::OperatorWithKernel {
     }
     if (ctx->HasOutput(framework::GradVarName("Bias"))) {
       ctx->SetOutputDim(framework::GradVarName("Bias"),
-                        ctx->GetInputDim("Bias"));
+                        ctx->GetInputDim("Scale"));
     }
   }

@@ -157,12 +158,39 @@ class LayerNormGradOp : public framework::OperatorWithKernel {
   }
 };

+class LayerNormGradOpDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("layer_norm_grad");
+    op->SetInput("X", Input("X"));
+    op->SetInput("Mean", Output("Mean"));
+    op->SetInput("Variance", Output("Variance"));
+    if (ForwardOp().Inputs().count("Scale") > 0) {
+      op->SetInput("Scale", Input("Scale"));
+      op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale"));
+    }
+
+    if (ForwardOp().Inputs().count("Bias") > 0) {
+      op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
+    }
+
+    op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle

 namespace ops = paddle::operators;
 REGISTER_OPERATOR(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::LayerNormGradOpDescMaker);
 REGISTER_OPERATOR(layer_norm_grad, ops::LayerNormGradOp);
 REGISTER_OP_CPU_KERNEL(
     layer_norm, ops::LayerNormKernel<paddle::platform::CPUDeviceContext, float>,
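The custom LayerNormGradOpDescMaker above replaces DefaultGradOpDescMaker<true>
so that the backward op declares only the variables its kernel actually reads
(the .h hunk below correspondingly drops the unused Y and Bias inputs). Reduced
to its skeleton, the pattern is (names from the hunk above; comments added):

    std::unique_ptr<framework::OpDesc> Apply() const override {
      std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
      op->SetType("layer_norm_grad");  // forward/backward paired by type name
      op->SetInput("X", Input("X"));   // forward only what the kernel needs
      op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
      op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
      op->SetAttrMap(Attrs());         // reuse the forward op's attributes
      return op;
    }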
diff --git a/paddle/fluid/operators/layer_norm_op.h b/paddle/fluid/operators/layer_norm_op.h
index f564a103963bd93732165596712230b0f37f7f26..db794ed42116144f310b9d7dc529cff49ba2c405 100644
--- a/paddle/fluid/operators/layer_norm_op.h
+++ b/paddle/fluid/operators/layer_norm_op.h
@@ -230,8 +230,8 @@ class LayerNormKernel : public framework::OpKernel<T> {
       PADDLE_ENFORCE_EQ(bias->numel(), right);

       auto ker =
-          jit::Get<jit::kLayerNorm, jit::LayerNormTuples<T>, platform::CPUPlace>(
-              right);
+          jit::KernelFuncs<jit::LayerNormTuple<T>, platform::CPUPlace>::Cache()
+              .At(right);
       ker(x.data<T>(), out.data<T>(), mean->data<T>(), var->data<T>(),
           scale->data<T>(), bias->data<T>(), static_cast<int>(left),
           static_cast<const float>(epsilon), right);
@@ -245,11 +245,9 @@ class LayerNormGradKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     const float epsilon = ctx.Attr<float>("epsilon");
     auto x = *ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
     auto* mean = ctx.Input<Tensor>("Mean");
     auto* var = ctx.Input<Tensor>("Variance");
     auto* scale = ctx.Input<Tensor>("Scale");
-    auto* bias = ctx.Input<Tensor>("Bias");
     auto d_y = *ctx.Input<Tensor>(framework::GradVarName("Y"));
     const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
@@ -275,18 +273,13 @@ class LayerNormGradKernel : public framework::OpKernel<T> {
       x.Resize(matrix_shape);
       temp.mutable_data<T>(matrix_shape, ctx.GetPlace());

-      if (!(bias && scale)) {
-        temp_norm.ShareDataWith(*y);
-        temp_norm.Resize(matrix_shape);
-      } else {
-        temp_norm.mutable_data<T>(matrix_shape, ctx.GetPlace());
-        // get x_norm
-        ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
-            ctx, &x, mean, /*axis*/ 0, SubFunctor<T>(), &temp_norm);
-        ElementwiseComputeEx<DivAndSqrtFunctor<T>, DeviceContext, T>(
-            ctx, &temp_norm, var, /*axis*/ 0,
-            DivAndSqrtFunctor<T>(static_cast<T>(epsilon)), &temp_norm);
-      }
+      temp_norm.mutable_data<T>(matrix_shape, ctx.GetPlace());
+      // get x_norm
+      ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
+          ctx, &x, mean, /*axis*/ 0, SubFunctor<T>(), &temp_norm);
+      ElementwiseComputeEx<DivAndSqrtFunctor<T>, DeviceContext, T>(
+          ctx, &temp_norm, var, /*axis*/ 0,
+          DivAndSqrtFunctor<T>(static_cast<T>(epsilon)), &temp_norm);
     }

     if (d_bias) {
diff --git a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc
index f5c802986e0573e81b3ab6187b57657b52b37215..63d3f809f263588bc1fbcd9ee4305e2ce9321e38 100644
--- a/paddle/fluid/operators/load_combine_op.cc
+++ b/paddle/fluid/operators/load_combine_op.cc
@@ -11,89 +11,27 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <fstream>
-#include "paddle/fluid/framework/data_type_transform.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device_context.h"
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/operators/load_combine_op.h"

 namespace paddle {
 namespace operators {

-class LoadCombineOp : public framework::OperatorBase {
+class LoadCombineOp : public framework::OperatorWithKernel {
  public:
-  LoadCombineOp(const std::string &type,
-                const framework::VariableNameMap &inputs,
-                const framework::VariableNameMap &outputs,
-                const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    auto filename = Attr<std::string>("file_path");
-    auto load_as_fp16 = Attr<bool>("load_as_fp16");
-    auto model_from_memory = Attr<bool>("model_from_memory");
-    auto out_var_names = Outputs("Out");
-    PADDLE_ENFORCE_GT(
-        static_cast<int>(out_var_names.size()), 0,
-        "The number of output variables should be greater than 0.");
-    if (!model_from_memory) {
-      std::ifstream fin(filename, std::ios::binary);
-      PADDLE_ENFORCE(static_cast<bool>(fin),
-                     "Cannot open file %s for load_combine op", filename);
-      LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names);
-    } else {
-      PADDLE_ENFORCE(!filename.empty(), "Cannot load file from memory");
-      std::stringstream fin(filename, std::ios::in | std::ios::binary);
-      LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names);
-    }
-  }
-  void LoadParamsFromBuffer(
-      const framework::Scope &scope, const platform::Place &place,
-      std::istream *buffer, bool load_as_fp16,
-      const std::vector<std::string> &out_var_names) const {
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-
-    for (size_t i = 0; i < out_var_names.size(); i++) {
-      auto *out_var = scope.FindVar(out_var_names[i]);
-
-      PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found",
-                     out_var_names[i]);
-
-      auto *tensor = out_var->GetMutable<framework::LoDTensor>();
-
-      // Error checking
-      PADDLE_ENFORCE(static_cast<bool>(*buffer), "Cannot read more");
-
-      // Get data from fin to tensor
-      DeserializeFromStream(*buffer, tensor, dev_ctx);
-
-      auto in_dtype = tensor->type();
-      auto out_dtype =
-          load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
-
-      if (in_dtype != out_dtype) {
-        // convert to float16 tensor
-        auto in_kernel_type = framework::OpKernelType(in_dtype, place);
-        auto out_kernel_type = framework::OpKernelType(out_dtype, place);
-        framework::LoDTensor fp16_tensor;
-        // copy LoD info to the new tensor
-        fp16_tensor.set_lod(tensor->lod());
-        framework::TransDataType(in_kernel_type, out_kernel_type, *tensor,
-                                 &fp16_tensor);
-
-        // reset output tensor
-        out_var->Clear();
-        tensor = out_var->GetMutable<framework::LoDTensor>();
-        tensor->set_lod(fp16_tensor.lod());
-        tensor->ShareDataWith(fp16_tensor);
-      }
-    }
-    buffer->peek();
-    PADDLE_ENFORCE(buffer->eof(),
-                   "You are not allowed to load partial data via "
-                   "load_combine_op, use load_op instead.");
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {}
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    framework::OpKernelType kt = framework::OpKernelType(
+        framework::proto::VarType::FP32, ctx.GetPlace());
+    return kt;
   }
 };

@@ -124,21 +62,31 @@ class LoadCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 LoadCombine Operator.

-LoadCombine operator loads LoDTensor variables from a file, which could be
-loaded in memory already. The file should contain one or more LoDTensors
+LoadCombine operator loads LoDTensor variables from a file, which could be
+loaded in memory already. The file should contain one or more LoDTensors
 serialized using the SaveCombine operator. The
-LoadCombine operator applies a deserialization strategy to appropriately load
-the LodTensors, and this strategy complements the serialization strategy used
+LoadCombine operator applies a deserialization strategy to appropriately load
+the LodTensors, and this strategy complements the serialization strategy used
 in the SaveCombine operator. Hence, the LoadCombine operator is tightly coupled
-with the SaveCombine operator, and can only deserialize one or more LoDTensors
+with the SaveCombine operator, and can only deserialize one or more LoDTensors
 that were saved using the SaveCombine operator.

 )DOC");
   }
 };
+
 }  // namespace operators
 }  // namespace paddle
+
 namespace ops = paddle::operators;

 REGISTER_OPERATOR(load_combine, ops::LoadCombineOp,
                   ops::LoadCombineOpProtoMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    load_combine,
+    ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, int8_t>,
+    ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/load_combine_op.cu b/paddle/fluid/operators/load_combine_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2a42c0daa7fc58165e85d851c602a65ec287c905
--- /dev/null
+++ b/paddle/fluid/operators/load_combine_op.cu
@@ -0,0 +1,25 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/load_combine_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    load_combine,
+    ops::LoadCombineOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LoadCombineOpKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::LoadCombineOpKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::LoadCombineOpKernel<paddle::platform::CUDADeviceContext, int8_t>,
+    ops::LoadCombineOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/load_combine_op.h b/paddle/fluid/operators/load_combine_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..8f620ba7d2f1c2797ad4fd76a16af9aeee9c2806
--- /dev/null
+++ b/paddle/fluid/operators/load_combine_op.h
@@ -0,0 +1,102 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <fstream>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/data_type_transform.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+template <typename DeviceContext, typename T>
+class LoadCombineOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto place = ctx.GetPlace();
+    auto filename = ctx.Attr<std::string>("file_path");
+    auto load_as_fp16 = ctx.Attr<bool>("load_as_fp16");
+    auto model_from_memory = ctx.Attr<bool>("model_from_memory");
+    auto &out_var_names = ctx.Outputs("Out");
+
+    PADDLE_ENFORCE_GT(
+        static_cast<int>(out_var_names.size()), 0,
+        "The number of output variables should be greater than 0.");
+    if (!model_from_memory) {
+      std::ifstream fin(filename, std::ios::binary);
+      PADDLE_ENFORCE(static_cast<bool>(fin),
+                     "Cannot open file %s for load_combine op", filename);
+      LoadParamsFromBuffer(ctx, place, &fin, load_as_fp16, out_var_names);
+    } else {
+      PADDLE_ENFORCE(!filename.empty(), "Cannot load file from memory");
+      std::stringstream fin(filename, std::ios::in | std::ios::binary);
+      LoadParamsFromBuffer(ctx, place, &fin, load_as_fp16, out_var_names);
+    }
+  }
+
+  void LoadParamsFromBuffer(
+      const framework::ExecutionContext &context, const platform::Place &place,
+      std::istream *buffer, bool load_as_fp16,
+      const std::vector<std::string> &out_var_names) const {
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+    auto out_vars = context.MultiOutputVar("Out");
+
+    for (size_t i = 0; i < out_var_names.size(); i++) {
+      PADDLE_ENFORCE(out_vars[i] != nullptr,
+                     "Output variable %s cannot be found", out_var_names[i]);
+
+      auto *tensor = out_vars[i]->GetMutable<framework::LoDTensor>();
+
+      // Error checking
+      PADDLE_ENFORCE(static_cast<bool>(*buffer), "Cannot read more");
+
+      // Get data from fin to tensor
+      DeserializeFromStream(*buffer, tensor, dev_ctx);
+
+      auto in_dtype = tensor->type();
+      auto out_dtype =
+          load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
+
+      if (in_dtype != out_dtype) {
+        // convert to float16 tensor
+        auto in_kernel_type = framework::OpKernelType(in_dtype, place);
+        auto out_kernel_type = framework::OpKernelType(out_dtype, place);
+        framework::LoDTensor fp16_tensor;
+        // copy LoD info to the new tensor
+        fp16_tensor.set_lod(tensor->lod());
+        framework::TransDataType(in_kernel_type, out_kernel_type, *tensor,
+                                 &fp16_tensor);
+
+        // reset output tensor
+        out_vars[i]->Clear();
+        tensor = out_vars[i]->GetMutable<framework::LoDTensor>();
+        tensor->set_lod(fp16_tensor.lod());
+        tensor->ShareDataWith(fp16_tensor);
+      }
+    }
+    buffer->peek();
+    PADDLE_ENFORCE(buffer->eof(),
+                   "You are not allowed to load partial data via "
+                   "load_combine_op, use load_op instead.");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc
index 4bce4eba22e4a8900f8d12454fd233e17c9ad617..656728c609eb19f90390d9dec72d9e30fd3040fd 100644
--- a/paddle/fluid/operators/load_op.cc
+++ b/paddle/fluid/operators/load_op.cc
@@ -11,89 +11,26 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <fstream>
-#include "paddle/fluid/framework/data_type_transform.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/profiler.h"
+#include <string>
+
+#include "paddle/fluid/operators/load_op.h"

 namespace paddle {
 namespace operators {

-class LoadOp : public framework::OperatorBase {
+class LoadOp : public framework::OperatorWithKernel {
  public:
-  LoadOp(const std::string &type, const framework::VariableNameMap &inputs,
-         const framework::VariableNameMap &outputs,
-         const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    // FIXME(yuyang18): We save variable to local file now, but we should change
-    // it to save an output stream.
-    auto filename = Attr<std::string>("file_path");
-    std::ifstream fin(filename, std::ios::binary);
-    PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op",
-                   filename);
+  using framework::OperatorWithKernel::OperatorWithKernel;

-    auto out_var_name = Output("Out");
-    auto *out_var = scope.FindVar(out_var_name);
-    PADDLE_ENFORCE(out_var != nullptr,
-                   "Output variable %s cannot be found in scope %p",
-                   out_var_name, &scope);
+  void InferShape(framework::InferShapeContext *ctx) const override {}

-    if (out_var->IsType<framework::LoDTensor>()) {
-      LoadLodTensor(fin, place, out_var);
-    } else if (out_var->IsType<framework::SelectedRows>()) {
-      LoadSelectedRows(fin, place, out_var);
-    } else {
-      PADDLE_ENFORCE(
-          false,
-          "Load only support LoDTensor and SelectedRows, %s has wrong type",
-          out_var_name);
-    }
-  }
-
-  void LoadLodTensor(std::istream &fin, const platform::Place &place,
-                     framework::Variable *var) const {
-    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-    auto *tensor = var->GetMutable<framework::LoDTensor>();
-    DeserializeFromStream(fin, tensor, dev_ctx);
-
-    auto load_as_fp16 = Attr<bool>("load_as_fp16");
-    auto in_dtype = tensor->type();
-    auto out_dtype = load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
-
-    if (in_dtype != out_dtype) {
-      // convert to float16 tensor
-      auto in_kernel_type = framework::OpKernelType(in_dtype, place);
-      auto out_kernel_type = framework::OpKernelType(out_dtype, place);
-      framework::LoDTensor fp16_tensor;
-      // copy LoD info to the new tensor
-      fp16_tensor.set_lod(tensor->lod());
-      framework::TransDataType(in_kernel_type, out_kernel_type, *tensor,
-                               &fp16_tensor);
-
-      // reset output tensor
-      var->Clear();
-      tensor = var->GetMutable<framework::LoDTensor>();
-      tensor->set_lod(fp16_tensor.lod());
-      tensor->ShareDataWith(fp16_tensor);
-    }
-  }
-
-  void LoadSelectedRows(std::istream &fin, const platform::Place &place,
-                        framework::Variable *var) const {
-    auto *selectedRows = var->GetMutable<framework::SelectedRows>();
-    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-    framework::DeserializeFromStream(fin, selectedRows, dev_ctx);
-    selectedRows->SyncIndex();
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    framework::OpKernelType kt = framework::OpKernelType(
+        framework::proto::VarType::FP32, platform::CPUPlace());
+    return kt;
   }
 };

@@ -116,8 +53,16 @@ class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker {
         "file.");
   }
 };
+
 }  // namespace operators
 }  // namespace paddle

 namespace ops = paddle::operators;

 REGISTER_OPERATOR(load, ops::LoadOp, ops::LoadOpProtoMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    load, ops::LoadOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LoadOpKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::LoadOpKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::LoadOpKernel<paddle::platform::CPUDeviceContext, int8_t>,
+    ops::LoadOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/load_op.cu b/paddle/fluid/operators/load_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..90f78110f8f349ebc834570c4fb9f15af24b144d
--- /dev/null
+++ b/paddle/fluid/operators/load_op.cu
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/load_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    load, ops::LoadOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LoadOpKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::LoadOpKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::LoadOpKernel<paddle::platform::CUDADeviceContext, int8_t>,
+    ops::LoadOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/load_op.h b/paddle/fluid/operators/load_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..3bf3c6bed2f0ddf352a2bad65b0d710097016b28
--- /dev/null
+++ b/paddle/fluid/operators/load_op.h
@@ -0,0 +1,102 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <fstream>
+#include <string>
+
+#include "paddle/fluid/framework/data_type_transform.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+namespace operators {
+template <typename DeviceContext, typename T>
+class LoadOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto place = ctx.GetPlace();
+    // FIXME(yuyang18): We save variable to local file now, but we should change
+    // it to save an output stream.
+    auto filename = ctx.Attr<std::string>("file_path");
+    std::ifstream fin(filename, std::ios::binary);
+    PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op",
+                   filename);
+
+    auto out_var_name = ctx.Outputs("Out").data();
+    auto *out_var = ctx.OutputVar("Out");
+
+    PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found ",
+                   out_var_name);
+
+    if (out_var->IsType<framework::LoDTensor>()) {
+      LoadLodTensor(fin, place, out_var, ctx);
+    } else if (out_var->IsType<framework::SelectedRows>()) {
+      LoadSelectedRows(fin, place, out_var);
+    } else {
+      PADDLE_ENFORCE(
+          false,
+          "Load only support LoDTensor and SelectedRows, %s has wrong type",
+          out_var_name);
+    }
+  }
+
+  void LoadLodTensor(std::istream &fin, const platform::Place &place,
+                     framework::Variable *var,
+                     const framework::ExecutionContext &ctx) const {
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+    auto *tensor = var->GetMutable<framework::LoDTensor>();
+    DeserializeFromStream(fin, tensor, dev_ctx);
+
+    auto load_as_fp16 = ctx.Attr<bool>("load_as_fp16");
+    auto in_dtype = tensor->type();
+    auto out_dtype = load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
+
+    if (in_dtype != out_dtype) {
+      // convert to float16 tensor
+      auto in_kernel_type = framework::OpKernelType(in_dtype, place);
+      auto out_kernel_type = framework::OpKernelType(out_dtype, place);
+      framework::LoDTensor fp16_tensor;
+      // copy LoD info to the new tensor
+      fp16_tensor.set_lod(tensor->lod());
+      framework::TransDataType(in_kernel_type, out_kernel_type, *tensor,
+                               &fp16_tensor);
+
+      // reset output tensor
+      var->Clear();
+      tensor = var->GetMutable<framework::LoDTensor>();
+      tensor->set_lod(fp16_tensor.lod());
+      tensor->ShareDataWith(fp16_tensor);
+    }
+  }
+
+  void LoadSelectedRows(std::istream &fin, const platform::Place &place,
+                        framework::Variable *var) const {
+    auto *selectedRows = var->GetMutable<framework::SelectedRows>();
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+    framework::DeserializeFromStream(fin, selectedRows, dev_ctx);
+    selectedRows->SyncIndex();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/lod_rank_table_op.cc b/paddle/fluid/operators/lod_rank_table_op.cc
index 166952fe23192799443ef9c9d1f7ba5056d19290..0a43ac0c52f9bc98eacf743480166682482cc3c0 100644
--- a/paddle/fluid/operators/lod_rank_table_op.cc
+++ b/paddle/fluid/operators/lod_rank_table_op.cc
@@ -64,11 +64,9 @@ class LoDRankTableInferShape : public framework::InferShapeBase {

 class LoDRankTableInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    for (auto &o : op_desc.Output("Out")) {
-      block->FindRecursiveOrCreateVar(o).SetType(
-          framework::proto::VarType::LOD_RANK_TABLE);
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    for (auto &o : ctx->Output("Out")) {
+      ctx->SetType(o, framework::proto::VarType::LOD_RANK_TABLE);
     }
   }
 };
diff --git a/paddle/fluid/operators/lod_reset_op.cc b/paddle/fluid/operators/lod_reset_op.cc
index 7c8fe5fbd7629b2d82552135bc1b052dfbabeba0..e0ab02cd90cdee848250a6aba882b0cb0c17abd7 100644
--- a/paddle/fluid/operators/lod_reset_op.cc
+++ b/paddle/fluid/operators/lod_reset_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */

 #include "paddle/fluid/operators/lod_reset_op.h"
+#include <memory>

 namespace paddle {
 namespace operators {
@@ -32,7 +33,10 @@ class LoDResetOp : public framework::OperatorWithKernel {
       PADDLE_ENFORCE_GT(level0.size(), 1,
                         "If Input(Y) not provided, the target lod should be "
                         "specified by attribute `target_lod`.");
+    } else {
+      ctx->ShareLoD("Y", "Out");
     }
+
     ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
   }

@@ -143,18 +147,39 @@ class LoDResetGradOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
-                                   ctx.device_context());
+    return framework::OpKernelType(
+        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))->type(),
+        ctx.device_context());
+  }
+};
+
+class LoDResetGradDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("lod_reset_grad");
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetInput("X", Input("X"));
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetAttrMap(Attrs());
+    return op;
   }
 };

+DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(LoDResetGradNoNeedBufferVarInference,
+                                      "X");
+
 }  // namespace operators
 }  // namespace paddle

 namespace ops = paddle::operators;
 REGISTER_OPERATOR(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(lod_reset_grad, ops::LoDResetGradOp);
+                  ops::LoDResetGradDescMaker);
+REGISTER_OPERATOR(lod_reset_grad, ops::LoDResetGradOp,
+                  ops::LoDResetGradNoNeedBufferVarInference);
 REGISTER_OP_CPU_KERNEL(
     lod_reset, ops::LoDResetKernel<paddle::platform::CPUDeviceContext, float>,
     ops::LoDResetKernel<paddle::platform::CPUDeviceContext, double>,
diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc
index 9b91cf526016307653d42990e56104ea082fb8b4..61e342737045616112d51b7753939286a31dc6cd 100644
--- a/paddle/fluid/operators/lod_tensor_to_array_op.cc
+++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc
@@ -201,10 +201,9 @@ class LoDTensorToArrayInferShape : public framework::InferShapeBase {

 class LoDTensorToArrayInferVarType : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    for (auto &out_var : op_desc.Output("Out")) {
-      block->Var(out_var)->SetType(framework::proto::VarType::LOD_TENSOR_ARRAY);
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    for (auto &out_var : ctx->Output("Out")) {
+      ctx->SetType(out_var, framework::proto::VarType::LOD_TENSOR_ARRAY);
     }
   }
 };
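A pattern repeated throughout this patch: VarTypeInference implementations no
longer receive an (OpDesc, BlockDesc) pair but a single InferVarTypeContext.
A minimal sketch of the new shape of such a functor (illustrative only, not a
hunk from the patch):

    class ExampleInferVarType : public framework::VarTypeInference {
     public:
      void operator()(framework::InferVarTypeContext *ctx) const override {
        for (auto &name : ctx->Output("Out")) {
          ctx->SetType(name, framework::proto::VarType::LOD_TENSOR);
        }
      }
    };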
diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc
index 0029932bc068c7f61ddb41cf3f87c9e1a5cd7749..d635fc617bc63e1f625e93d21886f6ad134947f6 100644
--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
@@ -33,7 +33,7 @@ class LookupTableOp : public framework::OperatorWithKernel {
     auto table_dims = ctx->GetInputDim("W");
     auto ids_dims = ctx->GetInputDim("Ids");
     int ids_rank = ids_dims.size();
-
+    VLOG(5) << "ids rank is " << ids_rank << std::endl;
     PADDLE_ENFORCE_EQ(table_dims.size(), 2);
     PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1,
                       "The last dimension of the 'Ids' tensor must be 1.");
@@ -147,22 +147,20 @@ class LookupTableOpGradVarTypeInference : public framework::VarTypeInference {
  public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const override {
-    auto out_var_name = op_desc.Output(framework::GradVarName("W")).front();
-    auto attr = op_desc.GetAttr("is_sparse");
+  void operator()(framework::InferVarTypeContext* ctx) const override {
+    auto out_var_name = ctx->Output(framework::GradVarName("W")).front();
+    auto attr = ctx->GetAttr("is_sparse");
     bool is_sparse = boost::get<bool>(attr);
     if (is_sparse) {
       VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W")
               << " is set to SelectedRows";
-      block->Var(out_var_name)
-          ->SetType(framework::proto::VarType::SELECTED_ROWS);
+      ctx->SetType(out_var_name, framework::proto::VarType::SELECTED_ROWS);
     } else {
       VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W")
               << " is set to LoDTensor";
-      block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR);
+      ctx->SetType(out_var_name, framework::proto::VarType::LOD_TENSOR);
     }
-    block->Var(out_var_name)->SetDataType(block->Var("W")->GetDataType());
+    ctx->SetDataType(out_var_name, ctx->GetDataType(ctx->Input("W")[0]));
   }
 };
diff --git a/paddle/fluid/operators/math.h b/paddle/fluid/operators/math.h
new file mode 100644
index 0000000000000000000000000000000000000000..8cc24200d37dffaff1deda2f0181e5875141add0
--- /dev/null
+++ b/paddle/fluid/operators/math.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#pragma once + +#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/platform/hostdevice.h" + +#include "math.h" // NOLINT + +namespace paddle { +namespace operators { + +inline HOSTDEVICE platform::float16 real_exp(platform::float16 x) { + return static_cast(::expf(static_cast(x))); +} + +inline HOSTDEVICE float real_exp(float x) { return ::expf(x); } + +inline HOSTDEVICE double real_exp(double x) { return ::exp(x); } + +inline HOSTDEVICE platform::float16 real_log(platform::float16 x) { + return static_cast(::logf(static_cast(x))); +} + +inline HOSTDEVICE float real_log(float x) { return ::logf(x); } + +inline HOSTDEVICE double real_log(double x) { return ::log(x); } + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/beam_search.cc b/paddle/fluid/operators/math/beam_search.cc index 69971ef7423eff6bc3f8543a491edb6b0bbd00ca..0155ef188ef967fbf67505d28beeeaf956bb3a70 100644 --- a/paddle/fluid/operators/math/beam_search.cc +++ b/paddle/fluid/operators/math/beam_search.cc @@ -56,15 +56,15 @@ class BeamSearchFunctor { // the output tensor shape should be [num_instances, 1] auto dims = framework::make_ddim( std::vector({static_cast(num_instances), 1})); - selected_ids->Resize(dims); - selected_scores->Resize(dims); - parent_idx->Resize({static_cast(num_instances)}); - auto *selected_ids_data = - selected_ids->mutable_data(platform::CPUPlace()); + selected_ids->mutable_data(dims, platform::CPUPlace()); auto *selected_scores_data = - selected_scores->mutable_data(platform::CPUPlace()); - auto *parent_idx_data = parent_idx->mutable_data(platform::CPUPlace()); + selected_scores->mutable_data(dims, platform::CPUPlace()); + auto *parent_idx_data = + parent_idx + ? parent_idx->mutable_data( + {static_cast(num_instances)}, platform::CPUPlace()) + : nullptr; // fill in data std::vector low_level; @@ -72,7 +72,9 @@ class BeamSearchFunctor { for (auto &items : selected_items) { low_level.push_back(low_offset); for (auto &item : items) { - parent_idx_data[low_offset] = static_cast(low_level.size() - 1); + if (parent_idx) { + parent_idx_data[low_offset] = static_cast(low_level.size() - 1); + } selected_ids_data[low_offset] = item.id; selected_scores_data[low_offset] = item.score; low_offset++; diff --git a/paddle/fluid/operators/math/beam_search.cu b/paddle/fluid/operators/math/beam_search.cu index d66778a6fe05c0460c805581ee6ffd6d5e9d746e..ecfeba338482a99735488fec08be8c3adcf4d0f4 100644 --- a/paddle/fluid/operators/math/beam_search.cu +++ b/paddle/fluid/operators/math/beam_search.cu @@ -168,6 +168,7 @@ __device__ __forceinline__ bool PruneEndBeams(Triple* top_beam_local, return finish_flag; } +template __device__ __forceinline__ void WriteBack( int64_t* selected_ids, float* selected_scores, int* parent_idx, size_t* selected_offsets, Triple* top_beam_local, @@ -183,7 +184,9 @@ __device__ __forceinline__ void WriteBack( selected_ids[global_index] = static_cast(top_beam_local[local_index].id); selected_scores[global_index] = top_beam_local[local_index].score; - parent_idx[global_index] = static_cast(global_offset); + if (ReturnParentIdx) { + parent_idx[global_index] = static_cast(global_offset); + } global_index++; } } @@ -241,9 +244,15 @@ __device__ void BeamSearchDetails( selected_offsets[0] = 0; } - WriteBack(selected_ids, selected_scores, parent_idx, selected_offsets, - top_beam_local, seq_offset_start, seq_offset_end, - selected_seq_start, selected_seq_length); + if (parent_idx) { + WriteBack(selected_ids, selected_scores, parent_idx, + 
selected_offsets, top_beam_local, seq_offset_start, + seq_offset_end, selected_seq_start, selected_seq_length); + } else { + WriteBack(selected_ids, selected_scores, parent_idx, + selected_offsets, top_beam_local, seq_offset_start, + seq_offset_end, selected_seq_start, selected_seq_length); + } } } @@ -337,8 +346,12 @@ class BeamSearchFunctor { selected_ids->mutable_data(selected_dims, context.GetPlace()); float* selected_scores_data = selected_scores->mutable_data(selected_dims, context.GetPlace()); - int* parent_idx_data = parent_idx->mutable_data( - {static_cast(num_seqs * beam_size)}, context.GetPlace()); + int* parent_idx_data = + parent_idx + ? parent_idx->mutable_data( + {static_cast(num_seqs * beam_size)}, + context.GetPlace()) + : nullptr; framework::LoD selected_lod(2); selected_lod[0].assign(abs_lod[level].begin(), abs_lod[level].end()); @@ -396,7 +409,9 @@ class BeamSearchFunctor { {static_cast(selected_lod[1].back()), 1}); selected_ids->Resize(final_selected_dims); selected_scores->Resize(final_selected_dims); - parent_idx->Resize({static_cast(selected_lod[1].back())}); + if (parent_idx) { + parent_idx->Resize({static_cast(selected_lod[1].back())}); + } } } }; diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu index cb200ec8d6ea533d546f3e01a16a48c88b14f677..44cbdf2e9882195819bc3ca047dbac6e2fa4e631 100644 --- a/paddle/fluid/operators/math/cross_entropy.cu +++ b/paddle/fluid/operators/math/cross_entropy.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/math.h" #include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/cuda_primitives.h" @@ -20,17 +21,6 @@ namespace paddle { namespace operators { namespace math { -namespace { - -__device__ __forceinline__ float real_log(float x) { return logf(x); } - -__device__ __forceinline__ double real_log(double x) { return log(x); } - -__device__ __forceinline__ platform::float16 real_log( - const platform::float16& val) { - return static_cast(logf(static_cast(val))); -} - template __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label, const int N, const int D, @@ -61,7 +51,6 @@ __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label, Y[blockIdx.x] = -val; } } -} // namespace template class CrossEntropyFunctor { diff --git a/paddle/fluid/operators/math/fc_compute.h b/paddle/fluid/operators/math/fc_compute.h index 0ad57c51be79cd3577b43c9af777bff710308fac..66ce57594a14d8c94737b5dbe83af413628ef1cf 100644 --- a/paddle/fluid/operators/math/fc_compute.h +++ b/paddle/fluid/operators/math/fc_compute.h @@ -30,17 +30,16 @@ inline void FCCompute(const BlasT& blas, const int M, return; } if (relu) { - auto compute = jit::KernelFuncs, - platform::CPUPlace>::Cache() - .At(N); + auto compute = + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + N); for (int i = 0; i < M; i++) { T* dst = Y + i * N; compute(B, dst, dst, N); } } else { - auto compute = jit::KernelFuncs, - platform::CPUPlace>::Cache() - .At(N); + auto compute = + jit::KernelFuncs, platform::CPUPlace>::Cache().At(N); #ifdef PADDLE_WITH_MKLML #pragma omp parallel for #endif diff --git a/paddle/fluid/operators/math/sequence_padding.cu b/paddle/fluid/operators/math/sequence_padding.cu index 
035e10dcbe4e2083723e47d7dda75ce267a9f141..1b433067900af71bb8a6833cef019d41f9c76858 100644 --- a/paddle/fluid/operators/math/sequence_padding.cu +++ b/paddle/fluid/operators/math/sequence_padding.cu @@ -78,12 +78,6 @@ class PaddingLoDTensorFunctor { "The numel of 'pad_value' can only be 1 or be equal to the " "'step_width'."); - if (!norm_by_times && seq_num == 1UL && pad_seq_len == max_seq_len) { - TensorCopy(seq_tensor, context.GetPlace(), context, pad_tensor); - pad_tensor->Resize(pad_tensor_dims); - return; - } - const int kBlockSize = 512; /* At least use 32 threads to copy sequence_width elements, @@ -129,12 +123,13 @@ class UnpaddingLoDTensorFunctor { CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len, step_width, layout); - + /* if (!norm_by_times && seq_num == 1UL && pad_seq_len == max_seq_len) { TensorCopy(pad_tensor, context.GetPlace(), context, seq_tensor); seq_tensor->Resize(seq_tensor_dims); return; } + */ const int kBlockSize = 512; diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc index 2a47502614b9cd3df4583992669ab4bf78228181..7af44f2b2ca56f615ca0c8ad4590958af2abe9eb 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cc +++ b/paddle/fluid/operators/math/sequence_pooling.cc @@ -256,8 +256,8 @@ class SequencePoolFunctor { static_cast(input.numel() / input.dims()[0]), jit::SeqPoolType::kSum); auto seqpool = - jit::Get, platform::CPUPlace>( - attr); + jit::KernelFuncs, platform::CPUPlace>::Cache() + .At(attr); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { attr.h = static_cast(lod[i + 1] - lod[i]); seqpool(src, dst, &attr); diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index a1cb3f972826a67721b00ce6df0ec48cc34d6e03..d77b6712c548370a99e350b73ab86b170c0e17dc 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -82,8 +82,7 @@ class SoftmaxFunctor> { const int kClassDim = 1; // 2D data. Batch x C auto compute_softmax = - jit::KernelFuncs, - platform::CPUPlace>::Cache() + jit::KernelFuncs, platform::CPUPlace>::Cache() .At(in_dims[kClassDim]); compute_softmax(in_data, out_data, in_dims[kClassDim], in_dims[kBatchDim]); } diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index 242a1b9ae92ade0caf1b0f1fcb5458b8b7070d84..f18282745200cc8ef9460e60728d777112f2b798 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -290,8 +290,10 @@ class MatMulOp : public framework::OperatorWithKernel { context->Attrs().Get("transpose_Y")); PADDLE_ENFORCE_EQ(mat_dim_x.width_, mat_dim_y.height_); - PADDLE_ENFORCE(mat_dim_x.batch_size_ == mat_dim_y.batch_size_ || - mat_dim_x.batch_size_ == 0 || mat_dim_y.batch_size_ == 0); + if (context->IsRuntime()) { + PADDLE_ENFORCE(mat_dim_x.batch_size_ == mat_dim_y.batch_size_ || + mat_dim_x.batch_size_ == 0 || mat_dim_y.batch_size_ == 0); + } std::vector dim_out; if (mat_dim_x.batch_size_ != 0) { dim_out = framework::vectorize(dim_x); diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc index 54c6a71111a2cc2f9e5004922ae5d3541a9d0a70..97387af92ffbd123ae6e795f17ef2273dadeab9d 100644 --- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/operators/concat_op.h" #include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" namespace paddle { namespace operators { @@ -38,15 +39,20 @@ static void EnforceLayouts(const std::vector inputs) { } static memory::primitive_desc CreateMemPrimDesc(const Tensor& input, - const mkldnn::engine& engine) { - constexpr auto data_type = mkldnn::memory::f32; + const mkldnn::engine& engine, + const memory::data_type& dt) { const auto dims = paddle::framework::vectorize2int(input.dims()); const auto format = input.format(); - auto description = memory::desc(dims, data_type, format); + auto description = memory::desc(dims, dt, format); auto mem_prim_desc = memory::primitive_desc(description, engine); return mem_prim_desc; } +static mkldnn::memory::format GetDstMemFormat( + const concat::primitive_desc& concat_pd) { + return (memory::format)concat_pd.dst_primitive_desc().desc().data.format; +} + static platform::CPUPlace GetCpuPlace( const paddle::framework::ExecutionContext& ctx) { auto place = ctx.GetPlace(); @@ -61,14 +67,30 @@ static const mkldnn::engine& GetMKLDNNEngine( return dev_ctx.GetEngine(); } +std::string CreateKey(const paddle::framework::ExecutionContext& ctx, + const std::vector multi_input, + const int64_t& concat_axis, const memory::data_type& dt) { + std::string key; + key.reserve(platform::MKLDNNHandler::MaxKeyLength); + for (size_t i = 0; i < multi_input.size(); i++) { + platform::MKLDNNHandler::AppendKeyDims( + &key, paddle::framework::vectorize2int(multi_input[i]->dims())); + } + platform::MKLDNNHandler::AppendKey(&key, std::to_string(concat_axis)); + platform::MKLDNNHandler::AppendKey(&key, ctx.op().Output("Out")); + platform::MKLDNNHandler::AppendKey(&key, std::to_string(dt)); + return key; +} + template class ConcatPrimitiveFactory { public: concat::primitive_desc CreateConcatPrimDescriptor( const std::vector multi_input, Tensor* output, - int concat_axis, const mkldnn::engine& mkldnn_engine) { - CreateSourcesDescriptors(multi_input, mkldnn_engine); - auto dst_desc = CreateDstMemDescriptor(output); + int concat_axis, const mkldnn::engine& mkldnn_engine, + const memory::data_type& dt = memory::data_type::f32) { + CreateSourcesDescriptors(multi_input, mkldnn_engine, dt); + auto dst_desc = CreateDstMemDescriptor(output, dt); return concat::primitive_desc(dst_desc, concat_axis, srcs_pd); } @@ -79,23 +101,39 @@ class ConcatPrimitiveFactory { return concat(concat_pd, inputs, dst_mem.get()); } + void SetSrcDataHandleByIndex(const std::vector& srcs, const size_t& i, + void* handler) { + srcs[i].set_data_handle(handler); + } + + void SetDstDataHandle(const memory& dst_mem, void* handler) { + dst_mem.set_data_handle(handler); + } + + std::vector GetSrcs() { return srcs; } + + memory GetDst() { return dst_mem.get(); } + private: - memory::desc CreateDstMemDescriptor(Tensor* output) { + memory::desc CreateDstMemDescriptor(Tensor* output, + const memory::data_type& dt) { auto dst_dims = paddle::framework::vectorize2int(output->dims()); - return memory::desc(dst_dims, platform::MKLDNNGetDataType(), - memory::format::any); + return memory::desc(dst_dims, dt, memory::format::any); } mkldnn::memory CreateDstMemory(const concat::primitive_desc& concat_pd, - Tensor* output, platform::CPUPlace place) { + Tensor* output, + const platform::CPUPlace& place) { return memory(concat_pd.dst_primitive_desc(), output->mutable_data(place)); } void CreateSourcesDescriptors(const std::vector multi_input, - const mkldnn::engine& 
mkldnn_engine) { + const mkldnn::engine& mkldnn_engine, + const memory::data_type& dt) { for (size_t i = 0; i < multi_input.size(); i++) { - auto mem_prim_desc = CreateMemPrimDesc(*multi_input[i], mkldnn_engine); + auto mem_prim_desc = + CreateMemPrimDesc(*multi_input[i], mkldnn_engine, dt); srcs_pd.push_back(mem_prim_desc); srcs.push_back( memory(mem_prim_desc, to_void_cast(multi_input[i]->data()))); @@ -120,21 +158,59 @@ template class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { - auto place = GetCpuPlace(ctx); - const auto& mkldnn_engine = GetMKLDNNEngine(ctx); - auto multi_input = ctx.MultiInput("X"); EnforceLayouts(multi_input); Tensor* output = ctx.Output("Out"); int64_t concat_axis = static_cast(ctx.Attr("axis")); + auto& dev_ctx = + ctx.template device_context(); + auto place = GetCpuPlace(ctx); + + memory::data_type dt = + paddle::framework::ToMKLDNNDataType(multi_input[0]->type()); ConcatPrimitiveFactory prim_creator; - auto concat_pd = prim_creator.CreateConcatPrimDescriptor( - multi_input, output, static_cast(concat_axis), mkldnn_engine); - auto concat = prim_creator.CreateConcatPrimitive(concat_pd, output, place); - stream(stream::kind::eager).submit({concat}).wait(); + std::string key = CreateKey(ctx, multi_input, concat_axis, dt); + const std::string key_prim = key + "@concat_p"; + const std::string key_concat_pd = key + "@concat_pd"; + const std::string key_srcs = key + "@concat_srcs"; + const std::string key_dst = key + "@concat_dst"; + + std::shared_ptr concat_pd; + std::shared_ptr> srcs; + std::shared_ptr dst_mem; + auto concat_p = std::static_pointer_cast(dev_ctx.GetBlob(key_prim)); + + if (concat_p == nullptr) { + const auto& mkldnn_engine = dev_ctx.GetEngine(); + concat_pd = std::make_shared( + prim_creator.CreateConcatPrimDescriptor(multi_input, output, + static_cast(concat_axis), + mkldnn_engine, dt)); + concat_p = std::make_shared( + prim_creator.CreateConcatPrimitive(*concat_pd, output, place)); + srcs = std::make_shared>(prim_creator.GetSrcs()); + dst_mem = std::make_shared(prim_creator.GetDst()); + dev_ctx.SetBlob(key_prim, concat_p); + dev_ctx.SetBlob(key_concat_pd, concat_pd); + dev_ctx.SetBlob(key_srcs, srcs); + dev_ctx.SetBlob(key_dst, dst_mem); + } else { + srcs = std::static_pointer_cast>( + dev_ctx.GetBlob(key_srcs)); + dst_mem = std::static_pointer_cast(dev_ctx.GetBlob(key_dst)); + concat_pd = std::static_pointer_cast( + dev_ctx.GetBlob(key_concat_pd)); + for (size_t i = 0; i < multi_input.size(); i++) { + prim_creator.SetSrcDataHandleByIndex( + *srcs, i, to_void_cast(multi_input[i]->data())); + } + prim_creator.SetDstDataHandle(*dst_mem, output->mutable_data(place)); + } + + stream(stream::kind::eager).submit({*concat_p}).wait(); - output->set_mkldnn_prim_desc(concat_pd.dst_primitive_desc()); + output->set_mkldnn_prim_desc(concat_pd->dst_primitive_desc()); } }; } // namespace operators @@ -143,4 +219,6 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_KERNEL(concat, MKLDNN, ::paddle::platform::CPUPlace, - ops::ConcatMKLDNNOpKernel) + ops::ConcatMKLDNNOpKernel, + ops::ConcatMKLDNNOpKernel, + ops::ConcatMKLDNNOpKernel); diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 14ca3e8073b9512732876e512a30968b15884495..8d96ae7e4215c2488564322e1dda46a81b46a665 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ 
b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
@@ -592,6 +592,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
           platform::SetDstMemoryHandler<uint8_t>(ctx, output, handler,
                                                  &dst_memory_p);
         } else {
+          need_s8_to_u8 = fuse_relu;
           platform::SetDstMemoryHandler<int8_t>(ctx, output, handler,
                                                 &dst_memory_p);
         }
diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
index 3a926a716f54a094eba11d63c3b29de27dff274b..69c0486eb63475d759b6869f55d14ef1bec08b59 100644
--- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
@@ -123,7 +123,7 @@ class FCMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto& dev_ctx = ctx.template device_context<platform::MKLDNNDeviceContext>();
     const auto& mkldnn_engine = dev_ctx.GetEngine();

-    auto input = ctx.Input<Tensor>("Input");
+    auto input = ctx.Input<LoDTensor>("Input");
     auto w = ctx.Input<Tensor>("W");
     auto bias = ctx.Input<Tensor>("Bias");
@@ -151,7 +151,13 @@ class FCMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     const T* input_data = input->data<T>();
     const T* w_data = w->data<T>();

-    auto output = ctx.Output<Tensor>("Out");
+    auto output = ctx.Output<LoDTensor>("Out");
+    int in_num_col_dims = ctx.Attr<int>("in_num_col_dims");
+    std::vector<int64_t> output_dims;
+    FCOutputSize(input->dims(), w->dims(), output_dims, in_num_col_dims);
+    output->Resize(framework::make_ddim(output_dims));
+    output->set_lod(input->lod());
+
     T* output_data = output->mutable_data<T>(ctx.GetPlace());

     auto dst_memory = mem.dst(output_data);
@@ -204,19 +210,21 @@ class FCMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
     Tensor* w_grad = ctx.Output<Tensor>(framework::GradVarName("W"));

+    const Tensor* input = ctx.Input<Tensor>("Input");
+    const T* input_data = input->data<T>();
+
+    const Tensor* w = ctx.Input<Tensor>("W");
+    const T* w_data = w->data<T>();
+
     if (input_grad) {
+      input_grad->Resize(input->dims());
       input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
     }
     if (w_grad) {
+      w_grad->Resize(w->dims());
       w_grad_data = w_grad->mutable_data<T>(ctx.GetPlace());
     }

-    const Tensor* input = ctx.Input<Tensor>("Input");
-    const T* input_data = input->data<T>();
-
-    const Tensor* w = ctx.Input<Tensor>("W");
-    const T* w_data = w->data<T>();
-
     const Tensor* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
     const T* out_grad_data = out_grad->data<T>();
diff --git a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
index e41bfb80dfc0452955f7978f74ccfea184886b69..4debc7ca5ec90d6cc781d10e817e9ed8650f12aa 100644
--- a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
@@ -73,6 +73,29 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
   }
 };

+template <typename T>
+class TransposeINT8MKLDNNOpKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    std::vector<int> axis = ctx.Attr<std::vector<int>>("axis");
+    std::vector<int> axis_int8 = {0, 2, 3, 1};
+    if (axis.size() != 1) {
+      PADDLE_ENFORCE_EQ(axis.size(), axis_int8.size());
+      for (size_t i = 0; i < axis.size(); i++) {
+        PADDLE_ENFORCE_EQ(axis[i], axis_int8[i],
+                          "Current INT8 MKLDNN Transpose kernel only supports "
+                          "axis with [0, 2, 3, 1] due to MKL-DNN kernel "
+                          "implementation.");
+      }
+    }
+    auto* input = ctx.Input<Tensor>("X");
+    auto* output = ctx.Output<Tensor>("Out");
+    output->ShareDataWith(*input);
+    output->set_layout(DataLayout::kMKLDNN);
+    output->set_format(input->format());
+  }
+};
+
 template <typename T>
 class TransposeMKLDNNGradOpKernel : public 
paddle::framework::OpKernel { public: @@ -140,7 +163,10 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_KERNEL(transpose2, MKLDNN, ::paddle::platform::CPUPlace, - ops::TransposeMKLDNNOpKernel); + ops::TransposeMKLDNNOpKernel, + ops::TransposeINT8MKLDNNOpKernel, + ops::TransposeINT8MKLDNNOpKernel); + REGISTER_OP_KERNEL(transpose, MKLDNN, ::paddle::platform::CPUPlace, ops::TransposeMKLDNNOpKernel); diff --git a/paddle/fluid/operators/nccl/nccl_op.cc b/paddle/fluid/operators/nccl/nccl_op.cc index 0018139cb06fe0573565c920849843e674df6f4c..6a0ae0dede695d80508bcc92a7a13ae9f73c3c57 100644 --- a/paddle/fluid/operators/nccl/nccl_op.cc +++ b/paddle/fluid/operators/nccl/nccl_op.cc @@ -60,12 +60,9 @@ class NCCLInitOp : public framework::OperatorBase { class NCCLInitOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { - auto out_var_name = op_desc.Output("Communicator").front(); - auto &out_var = block->FindRecursiveOrCreateVar(out_var_name); - auto var_type = framework::proto::VarType::RAW; - out_var.SetType(var_type); + void operator()(framework::InferVarTypeContext *ctx) const override { + auto out_var_name = ctx->Output("Communicator").front(); + ctx->SetType(out_var_name, framework::proto::VarType::RAW); } }; diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc index 256da34912560ddf1f7e430e8543efe00e5885bc..fa7cc58c08455457dd129afd130067704ec72c7c 100644 --- a/paddle/fluid/operators/nce_op.cc +++ b/paddle/fluid/operators/nce_op.cc @@ -237,23 +237,21 @@ class NCEOpGrad : public framework::OperatorWithKernel { class NCEOpGradVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { - auto weight_grad = op_desc.Output(framework::GradVarName("Weight")).front(); + void operator()(framework::InferVarTypeContext *ctx) const override { + auto weight_grad = ctx->Output(framework::GradVarName("Weight")).front(); - auto attr = op_desc.GetAttr("is_sparse"); + auto attr = ctx->GetAttr("is_sparse"); bool is_sparse = boost::get(attr); if (is_sparse) { VLOG(3) << "nce_op_grad op " << weight_grad << " and " << " is set to SelectedRows"; - block->Var(weight_grad) - ->SetType(framework::proto::VarType::SELECTED_ROWS); + ctx->SetType(weight_grad, framework::proto::VarType::SELECTED_ROWS); } else { VLOG(3) << "nce_op_grad op " << weight_grad << " and " << " is set to LoDTensor"; - block->Var(weight_grad)->SetType(framework::proto::VarType::LOD_TENSOR); + ctx->SetType(weight_grad, framework::proto::VarType::LOD_TENSOR); } - block->Var(weight_grad)->SetDataType(block->Var("Input")->GetDataType()); + ctx->SetDataType(weight_grad, ctx->GetDataType(ctx->Input("Input")[0])); } }; diff --git a/paddle/fluid/operators/ngraph/ngraph_engine.cc b/paddle/fluid/operators/ngraph/ngraph_engine.cc index 41037d9039bb53038af80eafa269ee9246dc9980..9f73bbc1fdc72766a0b57bc72c62d208277c2f20 100644 --- a/paddle/fluid/operators/ngraph/ngraph_engine.cc +++ b/paddle/fluid/operators/ngraph/ngraph_engine.cc @@ -29,7 +29,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/operators/ngraph/ngraph_bridge.h" #include "paddle/fluid/operators/ngraph/ngraph_engine.h" @@ -42,130 +41,199 @@ static ngraph::Shape Ddim2Shape(const framework::DDim& dims) { for (int i = 0; i < dims.size(); ++i) { int k = dims[i]; k = k == 0 ? 1 : k; - sp.push_back(k); + sp.emplace_back(k); } return sp; } +static framework::DDim Shape2Ddim(const ngraph::Shape& shape) { + std::vector dims; + for (size_t i = 0; i < shape.size(); ++i) { + int64_t k = shape[i]; + dims.emplace_back(k); + } + return framework::make_ddim(dims); +} + static std::map pd2ng_type_map = { {framework::proto::VarType::FP32, ngraph::element::f32}, {framework::proto::VarType::FP64, ngraph::element::f64}, {framework::proto::VarType::INT32, ngraph::element::i32}, {framework::proto::VarType::INT64, ngraph::element::i64}, - {framework::proto::VarType::BOOL, ngraph::element::boolean}, -}; - -std::unordered_map> - NgraphEngine::func_cache_ = {}; + {framework::proto::VarType::BOOL, ngraph::element::boolean}}; + +static std::map + ng2pd_type_map = { + {ngraph::element::f32, framework::proto::VarType::FP32}, + {ngraph::element::f64, framework::proto::VarType::FP64}, + {ngraph::element::i32, framework::proto::VarType::INT32}, + {ngraph::element::i64, framework::proto::VarType::INT64}, + {ngraph::element::boolean, framework::proto::VarType::BOOL}}; + +std::vector NgraphEngine::feed_vars = {}; +std::vector NgraphEngine::fetch_vars = {}; +framework::Variable* NgraphEngine::pre_var_ptr = nullptr; +const framework::BlockDesc* NgraphEngine::p_bdesc = nullptr; + +std::unordered_map NgraphEngine::engine_cache = {}; +std::unordered_map>> + NgraphEngine::t_in_cache_ = {}; std::shared_ptr NgraphEngine::backend_ = ngraph::runtime::Backend::create("CPU"); static std::vector> NgraphOpIntervals( - framework::BlockDesc* block) { + std::vector>* ops) { + NgraphEngine::feed_vars.clear(); + NgraphEngine::fetch_vars.clear(); std::vector> intervals; - auto ops = block->AllOps(); - int size = ops.size(); + + int size = ops->size(); int left = 0; - while (left < size && ops.at(left)->Type() != framework::kFeedOpType) { + while (left < size && ops->at(left)->Type() != framework::kFeedOpType && + ops->at(left)->Type() != framework::kFetchOpType) { ++left; } - if (left == size) { - return intervals; - } - while (left < size && ops.at(left)->Type() == framework::kFeedOpType) { + + while (left < size && ops->at(left)->Type() == framework::kFeedOpType) { + for (auto& var_name_item : ops->at(left)->Outputs()) { + for (auto& var_name : var_name_item.second) { + NgraphEngine::feed_vars.emplace_back(var_name); + } + } ++left; } int right = left; - while (right < size && ops.at(right)->Type() != framework::kFetchOpType) { + while (right < size && ops->at(right)->Type() != framework::kFetchOpType) { ++right; } - if (right == size) { - return intervals; + + int index = right; + while (index < size && ops->at(index)->Type() == framework::kFetchOpType) { + for (auto& var_name_item : ops->at(index)->Inputs()) { + for (auto& var_name : var_name_item.second) { + NgraphEngine::fetch_vars.emplace_back(var_name); + } + } + ++index; + } + + if (left == size || ops->at(left)->Type() == framework::kFetchOpType) { + left = 0; } - if (left >= right) return intervals; // (left, right - 1) represents indices between feed and 
fetch int pivot = left; while (pivot < right) { - auto op_type = ops.at(pivot)->Type(); + auto op_type = ops->at(pivot)->Type(); if (NgraphBridge::isRegister(op_type)) { ++pivot; } else { int start = pivot, end = start; while (pivot < right && - (!NgraphBridge::isRegister(ops.at(pivot)->Type()))) { + (!NgraphBridge::isRegister(ops->at(pivot)->Type()))) { ++pivot; ++end; } std::vector interval = {start, end}; - intervals.push_back(interval); + intervals.emplace_back(interval); } } // end while return intervals; } -static void SubstituteNgraphOp(framework::BlockDesc* block, - std::string block_str, - std::vector interval) { - framework::ProgramDesc program; - block->RemoveOp(interval.at(0), interval.at(1)); - auto* ng_op = block->InsertOp(interval.at(0)); - ng_op->SetType("ngraph_engine"); - ng_op->SetAttr("interval", interval); - ng_op->SetAttr("graph", block_str); +static void SubstituteNgraphOp( + std::vector>* ops, + std::string engine_key, std::string block_str, std::vector interval) { + framework::OpDesc ng_op_desc(nullptr); + ng_op_desc.SetType("ngraph_engine"); + ng_op_desc.SetAttr("interval", interval); + ng_op_desc.SetAttr("engine_key", engine_key); + ng_op_desc.SetAttr("graph", block_str); + + ops->erase(ops->begin() + interval[0], ops->begin() + interval[1]); + ops->insert(ops->begin() + interval[0], + framework::OpRegistry::CreateOp(ng_op_desc)); } -// TODO(baojun-nervana): Move EnableNgraph to compile time per PR #15089 -void NgraphEngine::EnableNgraph(const framework::ProgramDesc& program) { -#ifdef PADDLE_WITH_NGRAPH - VLOG(4) << "use_ngraph=True"; - for (size_t bid = 0; bid < program.Size(); ++bid) { - // TODO(baojun-nervana): Remove the const_cast - auto* block = - const_cast(program).MutableBlock(bid); - std::string block_str = block->Proto()->SerializeAsString(); - auto intervals = NgraphOpIntervals(block); - for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) { - SubstituteNgraphOp(block, block_str, *it); - } +std::string SerializedBlock(const std::vector& op_descs) { + framework::proto::BlockDesc block_proto; + framework::BlockDesc block_desc(nullptr, &block_proto); + block_desc.Proto()->set_parent_idx(-1); + block_desc.Proto()->set_idx(0); + + for (auto* op_desc : op_descs) { + auto* op = block_desc.AppendOp(); + *op->Proto() = *op_desc->Proto(); + } + return block_desc.Proto()->SerializeAsString(); +} + +std::string GenerateEngineKey(const framework::BlockDesc& bdesc) { + framework::proto::BlockDesc block_proto; + framework::BlockDesc block_desc(nullptr, &block_proto); + block_desc.Proto()->set_parent_idx(-1); + block_desc.Proto()->set_idx(0); + + for (auto& op_desc : bdesc.AllOps()) { + auto* op = block_desc.AppendOp(); + *op->Proto() = *op_desc->Proto(); + } + auto engine_key = std::to_string( + std::hash()(block_desc.Proto()->SerializeAsString())); + return engine_key; +} + +std::string GenerateEngineKey(const std::vector& engine_inputs, + const std::vector& engine_outputs, + int size) { + std::string engine_hash_key = ""; + for (auto name : engine_inputs) { + engine_hash_key += name; + } + for (auto name : engine_outputs) { + engine_hash_key += name; + } + engine_hash_key += std::to_string(size); + auto engine_key = std::to_string(std::hash()(engine_hash_key)); + return engine_key; +} + +void NgraphEngine::FuseNgraphOps( + const framework::BlockDesc& block_desc, + std::vector>* ops) { + NgraphEngine::p_bdesc = &block_desc; + auto intervals = NgraphOpIntervals(ops); + std::string engine_key = + GenerateEngineKey(feed_vars, fetch_vars, ops->size()); + 
for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) { + SubstituteNgraphOp(ops, engine_key, "", *it); } -#else - LOG(WARNING) - << "'NGRAPH' is not supported, Please re-compile with WITH_NGRAPH option"; -#endif } NgraphEngine::NgraphEngine(const framework::Scope& scope, const platform::Place& place, - const std::string& serialized_graph, - const std::vector& interval) + const framework::ExecutionContext& ctx) : scope_(scope), place_(place) { + std::string serialized_graph = ctx.Attr("graph"); + auto interval = ctx.Attr>("interval"); + std::string engine_key = ctx.Attr("engine_key"); + var_in_node_map_ = std::make_shared< std::unordered_map>>(); var_node_map_ = std::make_shared< std::unordered_map>>(); - func_cache_key_ = std::to_string(interval[0]) + std::to_string(interval[1]) + - serialized_graph; - - framework::proto::BlockDesc bdesc; - bdesc.ParseFromString(serialized_graph); - framework::BlockDesc block(nullptr, &bdesc); - - Prepare(block, interval); - - BuildNgIO(); - - GetNgFunction(); + GetNgFunction(engine_key, interval); } -void NgraphEngine::Prepare(const framework::BlockDesc& block, - const std::vector& interval) { - for (auto& var : block.AllVars()) { +void NgraphEngine::Prepare(const std::vector& interval) { + bool has_fetch = false, is_full = false; + for (auto& var : p_bdesc->AllVars()) { if (!(var->GetType() == framework::proto::VarType::SELECTED_ROWS || var->GetType() == framework::proto::VarType::LOD_TENSOR || var->GetType() == framework::proto::VarType::LOD_TENSOR_ARRAY)) { @@ -192,125 +260,79 @@ void NgraphEngine::Prepare(const framework::BlockDesc& block, } } - auto ops_desc = block.AllOps(); - int idx = interval[0]; - while (idx < interval[1]) { - auto op_desc = ops_desc.at(idx); - auto op = framework::OpRegistry::CreateOp(*op_desc); - fused_ops_.push_back(std::move(op)); - ++idx; - } - - while (ops_desc.at(idx)->Type() != framework::kFetchOpType) { - auto op_desc = ops_desc.at(idx); - for (auto& var_name_item : op_desc->Inputs()) { - for (auto& var_name : var_name_item.second) { - post_op_inputs_.insert(var_name); - } + std::vector ops_desc; + for (auto op_desc : p_bdesc->AllOps()) { + ops_desc.emplace_back(op_desc); + if (op_desc->Type() == framework::kFetchOpType) { + has_fetch = true; } - ++idx; - } - - while (idx < static_cast(ops_desc.size()) && - ops_desc.at(idx)->Type() == framework::kFetchOpType) { - std::string fetch_target_name = ops_desc.at(idx)->Input("X")[0]; - fetches_.insert(fetch_target_name); - ++idx; } - if (ops_desc.at(interval.at(0) - 1)->Type() == framework::kFeedOpType && - ops_desc.at(interval.at(1))->Type() == framework::kFetchOpType) { - ng_op_state_ = OpState::FULL; - } - - for (auto* op_desc : ops_desc) { + for (auto op_desc : ops_desc) { if (op_desc->Type().find("_grad") != std::string::npos) { - ng_op_state_ = ng_op_state_ == OpState::FULL ? OpState::FULL_TRAIN - : OpState::PARTIAL_TRAIN; + this->is_test_ = false; break; } } - if (ng_op_state_ != OpState::FULL_TRAIN && - ng_op_state_ != OpState::PARTIAL_TRAIN) { - ng_op_state_ = ng_op_state_ == OpState::FULL ? 
OpState::FULL_TEST - : OpState::PARTIAL_TEST; + if (interval[0] > 0 && + ops_desc.at(interval[0] - 1)->Type() == framework::kFeedOpType && + interval[1] < static_cast(ops_desc.size()) && + ops_desc.at(interval[1])->Type() == framework::kFetchOpType) { + is_full = true; } -} -void NgraphEngine::GetNgInputShape( - std::shared_ptr op) { - framework::RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_); - op->RuntimeInferShape(scope_, place_, ctx); - for (auto& var_name_item : op->Inputs()) { - for (auto& var_name : var_name_item.second) { - auto* var = scope_.FindVar(var_name); - if (var && var->IsType()) { - auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); - auto sp = Ddim2Shape(tensor_pd->dims()); - if (std::find(var_in_.begin(), var_in_.end(), var_name) != - var_in_.end()) { - if (var_node_map_->find(var_name) == var_node_map_->end()) { - // auto ng_type = pd2ng_type_map.at(GetDataTypeOfVar(var)); - auto ng_type = var_type_map_.at(var_name); - auto prm = - std::make_shared(ng_type, sp, true); - (*var_node_map_)[var_name] = prm; - (*var_in_node_map_)[var_name] = prm; - } - } - } - } + if (is_full) { + this->op_state_ = this->is_test_ ? OpState::FULL_TEST : OpState::FULL_TRAIN; + } else { + this->op_state_ = + this->is_test_ ? OpState::PARTIAL_TEST : OpState::PARTIAL_TRAIN; } -} -void NgraphEngine::BuildNgNodes() { - for (auto& op : fused_ops_) { - for (auto& var_name_item : op->Outputs()) { + int idx = interval[0]; + while (idx < interval[1]) { + this->fused_ops_.emplace_back( + framework::OpRegistry::CreateOp(*(ops_desc[idx]))); + ++idx; + } + while (idx < static_cast(ops_desc.size()) && + ops_desc.at(idx)->Type() != framework::kFetchOpType) { + auto op_desc = ops_desc.at(idx); + for (auto& var_name_item : op_desc->Inputs()) { for (auto& var_name : var_name_item.second) { - if (var_node_map_->find(var_name) == var_node_map_->end()) { - auto* var = scope_.FindVar(var_name); - if (var && var->IsType()) { - auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); - auto& ddim = tensor_pd->dims(); - auto ng_shape = Ddim2Shape(ddim); - auto ng_type = var_type_map_.at(var_name); - auto prm = std::make_shared(ng_type, - ng_shape, true); - (*var_node_map_)[var_name] = prm; - } - } + this->post_op_inputs_.insert(var_name); } } + ++idx; } - NgraphBridge ngb(var_node_map_); - for (auto& op : fused_ops_) { - ngb.BuildNgNode(op); + + if (!has_fetch) { + op_state_ = OpState::UNKNOWN; } + + BuildNgIO(ops_desc, interval); } -void NgraphEngine::BuildNgIO() { +void NgraphEngine::BuildNgIO(const std::vector& ops_desc, + const std::vector& interval) { std::unordered_set inputs; std::unordered_set outputs; - - for (auto& op : fused_ops_) { + for (int i = interval[0]; i < interval[1]; ++i) { + auto op = ops_desc[i]; for (auto& var_name_item : op->Inputs()) { for (auto& var_name : var_name_item.second) { inputs.insert(var_name); const bool is_output = outputs.find(var_name) != outputs.end(); if (!is_output && std::find(var_in_.begin(), var_in_.end(), var_name) == - var_in_.end()) { + var_in_.end() && + scope_.FindVar(var_name)) { // fill var_in here to keep lhs and rhs order - var_in_.push_back(var_name); + this->var_in_.emplace_back(var_name); } } } - if (op->Type() != "fill_constant") { - GetNgInputShape(op); - } - for (auto& var_name_item : op->Outputs()) { PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, "op %s has more than 1 output - Not handling yet", @@ -322,172 +344,279 @@ void NgraphEngine::BuildNgIO() { } // var_out.clear(); - for (auto& op : fused_ops_) { + for (int i = 
interval[0]; i < interval[1]; ++i) { + auto op = ops_desc[i]; for (auto& var_name_item : op->Outputs()) { PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, "op %s has more than 1 output - Not handling yet", op->Type()); for (auto& var_name : var_name_item.second) { - switch (ng_op_state_) { + switch (this->op_state_) { case OpState::PARTIAL_TEST: if (post_op_inputs_.find(var_name) != post_op_inputs_.end() || - fetches_.find(var_name) != fetches_.end()) { - var_out_.push_back(var_name); + find(fetch_vars.begin(), fetch_vars.end(), var_name) != + fetch_vars.end()) { + this->var_out_.emplace_back(var_name); } break; case OpState::FULL_TEST: - if (fetches_.find(var_name) != fetches_.end()) { - var_out_.push_back(var_name); + if (find(fetch_vars.begin(), fetch_vars.end(), var_name) != + fetch_vars.end()) { + this->var_out_.emplace_back(var_name); } break; case OpState::PARTIAL_TRAIN: - if (fetches_.find(var_name) != fetches_.end() || + if (find(fetch_vars.begin(), fetch_vars.end(), var_name) != + fetch_vars.end() || post_op_inputs_.find(var_name) != post_op_inputs_.end() || persistables_.find(var_name) != persistables_.end()) { - var_out_.push_back(var_name); + this->var_out_.emplace_back(var_name); } break; case OpState::FULL_TRAIN: - if (fetches_.find(var_name) != fetches_.end() || + if (find(fetch_vars.begin(), fetch_vars.end(), var_name) != + fetch_vars.end() || persistables_.find(var_name) != persistables_.end()) { - var_out_.push_back(var_name); + this->var_out_.emplace_back(var_name); } break; default: - var_out_.push_back(var_name); + this->var_out_.emplace_back(var_name); } } } } + + for (size_t i = 0; i < var_in_.size(); ++i) { + auto var_name = var_in_[i]; + if (persistables_.find(var_name) == persistables_.end()) { + var_in_updates_.emplace_back(i); + } + } } -void NgraphEngine::BuildNgFunction() { +void NgraphEngine::GetNgInputShape() { + for (auto& var_name : var_in_) { + auto* var = scope_.FindVar(var_name); + if (var && var->IsType()) { + auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); + auto sp = Ddim2Shape(tensor_pd->dims()); + auto ng_type = var_type_map_[var_name]; + auto prm = std::make_shared(ng_type, sp, true); + (*var_node_map_)[var_name] = prm; + (*var_in_node_map_)[var_name] = prm; + } + } +} + +void NgraphEngine::BuildNgNodes() { + for (auto& op : fused_ops_) { + for (auto& var_name_item : op->Outputs()) { + for (auto& var_name : var_name_item.second) { + if (var_node_map_->find(var_name) == var_node_map_->end()) { + auto* var = scope_.FindVar(var_name); + if (var && var->IsType()) { + auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); + auto& ddim = tensor_pd->dims(); + auto ng_shape = Ddim2Shape(ddim); + auto ng_type = var_type_map_[var_name]; + auto prm = std::make_shared(ng_type, + ng_shape, true); + (*var_node_map_)[var_name] = prm; + } + } + } + } + } + + NgraphBridge ngb(var_node_map_); + for (auto& op : fused_ops_) { + ngb.BuildNgNode(op); + } +} + +void NgraphEngine::RunInferShape() { + for (auto& op : fused_ops_) { + framework::RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_); + op->RuntimeInferShape(scope_, place_, ctx); + } +} + +void NgraphEngine::BuildNgFunction(const std::vector& interval) { + Prepare(interval); + RunInferShape(); + GetNgInputShape(); BuildNgNodes(); ngraph_function_ = nullptr; ngraph::NodeVector func_outputs; ngraph::ParameterVector func_inputs; for (auto& vo : var_out_) { - func_outputs.push_back(var_node_map_->at(vo)); + func_outputs.emplace_back(var_node_map_->at(vo)); } for (auto& vi : var_in_) { 
std::shared_ptr prm = std::dynamic_pointer_cast( var_in_node_map_->at(vi)); - func_inputs.push_back(prm); + func_inputs.emplace_back(prm); } ngraph_function_ = std::make_shared(func_outputs, func_inputs); } -void NgraphEngine::GetNgFunction() { - bool cache_on = true; - if (cache_on) { - std::string input_shape_str; - for (auto& var_name : var_in_) { - auto shape = var_node_map_->at(var_name)->get_shape(); - for (size_t i = 0; i < shape.size(); ++i) { - input_shape_str += std::to_string(shape.at(i)); +void NgraphEngine::GetNgFunction(std::string engine_key, + const std::vector& interval) { + bool use_cache = true; + if (use_cache) { + this->func_cache_key_ = ""; + for (int i = 0; i < std::min(static_cast(feed_vars.size()), 10); ++i) { + auto* var = scope_.FindVar(feed_vars[i]); + if (var && var->IsType()) { + auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); + auto dims = tensor_pd->dims(); + for (int j = 0; j < dims.size(); ++j) { + func_cache_key_ += std::to_string(dims[j]); + } } } - func_cache_key_ = input_shape_str + func_cache_key_; - if (func_cache_.find(func_cache_key_) != func_cache_.end()) { - ngraph_function_ = func_cache_.at(func_cache_key_); - } else { - BuildNgFunction(); - func_cache_[func_cache_key_] = ngraph_function_; + func_cache_key_ += std::to_string(interval[0]) + "_" + + std::to_string(interval[1]) + engine_key; + func_cache_key_ = std::to_string(std::hash()(func_cache_key_)); + + if (engine_cache.find(func_cache_key_) != engine_cache.end()) { + if (engine_cache[func_cache_key_].persistables.size() == 0) { + engine_cache.clear(); + t_in_cache_.clear(); + } else { + auto var_name = engine_cache[func_cache_key_].persistables.begin(); + framework::Variable* var = scope_.FindVar(*var_name); + if (var != pre_var_ptr) { + engine_cache.clear(); + t_in_cache_.clear(); + } + pre_var_ptr = var; + } + } + + if (engine_cache.find(func_cache_key_) == engine_cache.end()) { + BuildNgFunction(interval); + engine_cache[func_cache_key_].ngraph_function = this->ngraph_function_; + engine_cache[func_cache_key_].persistables = this->persistables_; + engine_cache[func_cache_key_].var_in_updates = this->var_in_updates_; + engine_cache[func_cache_key_].var_in = this->var_in_; + engine_cache[func_cache_key_].var_out = this->var_out_; + engine_cache[func_cache_key_].is_test = this->is_test_; } } else { - BuildNgFunction(); + BuildNgFunction(interval); } } void NgraphEngine::Run(const framework::Scope& scope, const platform::Place& place) const { - std::vector> t_in; - std::vector> t_out; + std::shared_ptr ng_func; + const std::set* p_persistables; + const std::vector* p_var_in_updates; + const std::vector* p_var_in; + const std::vector* p_var_out; + bool is_test; + + bool use_cache = true; + if (use_cache) { + PADDLE_ENFORCE(engine_cache.find(func_cache_key_) != engine_cache.end(), + "Cannot find cached data to run ngraph function"); + ng_func = engine_cache[func_cache_key_].ngraph_function; + p_persistables = &(engine_cache[func_cache_key_].persistables); + p_var_in_updates = &(engine_cache[func_cache_key_].var_in_updates); + p_var_in = &(engine_cache[func_cache_key_].var_in); + p_var_out = &(engine_cache[func_cache_key_].var_out); + is_test = engine_cache[func_cache_key_].is_test; + } else { + ng_func = ngraph_function_; + p_persistables = &this->persistables_; + p_var_in_updates = &this->var_in_updates_; + p_var_in = &this->var_in_; + p_var_out = &this->var_out_; + is_test = this->is_test_; + } - for (size_t i = 0; i < var_in_.size(); ++i) { - auto vi = var_in_.at(i); - auto 
sp = var_node_map_->at(vi)->get_shape(); - std::shared_ptr ti; - auto* var = scope.FindVar(vi); - if (var && var->IsType()) { - auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var); - PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()), - "Ensure ngraph tensor layout align with paddle tensor"); - auto ng_type = var_type_map_.at(vi); - if (ng_type == ngraph::element::f32) { - auto pd_arr = tensor_pd->mutable_data(place); - ti = backend_->create_tensor(ngraph::element::f32, sp, pd_arr); - } else if (ng_type == ngraph::element::i32) { - const int* arr = tensor_pd->data(); - ti = backend_->create_tensor(ngraph::element::i32, sp, - const_cast(arr)); - } else if (ng_type == ngraph::element::i64) { - auto pd_arr = tensor_pd->mutable_data(place); - ti = backend_->create_tensor(ngraph::element::i64, sp, pd_arr); - } else if (ng_type == ngraph::element::f64) { - auto pd_arr = tensor_pd->mutable_data(place); - ti = backend_->create_tensor(ngraph::element::f64, sp, pd_arr); - } else if (ng_type == ngraph::element::boolean) { - auto pd_arr = tensor_pd->mutable_data(place); - ti = backend_->create_tensor(ngraph::element::boolean, sp, pd_arr); + std::vector>* p_t_in; + std::vector> t_in = {}; + + auto m_parameters = ng_func->get_parameters(); + auto m_results = ng_func->get_results(); + if (is_test && use_cache && + t_in_cache_.find(func_cache_key_) != t_in_cache_.end()) { + p_t_in = &(t_in_cache_[func_cache_key_]); + for (size_t i = 0; i < p_var_in_updates->size(); ++i) { + int index = p_var_in_updates->at(i); + auto vi = p_var_in->at(index); + auto sp = m_parameters[index]->get_shape(); + auto ng_type = m_parameters[index]->get_element_type(); + std::shared_ptr ti; + auto* var = scope.FindVar(vi); + if (var && var->IsType()) { + auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var); + void* pd_arr = tensor_pd->mutable_data(place, ng2pd_type_map[ng_type]); + ti = backend_->create_tensor(ng_type, sp, pd_arr); + (*p_t_in)[index] = ti; } else { - PADDLE_THROW("Data type not handling for var %s", vi); + PADDLE_THROW("Cannot find var or tensor with var name %s", vi); } + } + } else { + if (is_test && use_cache) { + p_t_in = &(t_in_cache_[func_cache_key_]); } else { - PADDLE_THROW("Cannot find var or tensor with var name %s", vi); + p_t_in = &t_in; } - bool is_test = (ng_op_state_ == OpState::PARTIAL_TEST || - ng_op_state_ == OpState::FULL_TEST) - ? true - : false; - bool is_persistable = - (persistables_.find(vi) != persistables_.end()) ? true : false; - if (is_test && is_persistable) { - ti->set_stale(false); + + for (size_t i = 0; i < p_var_in->size(); ++i) { + auto vi = p_var_in->at(i); + auto sp = m_parameters[i]->get_shape(); + auto ng_type = m_parameters[i]->get_element_type(); + std::shared_ptr ti; + auto* var = scope.FindVar(vi); + if (var && var->IsType()) { + auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var); + void* pd_arr = tensor_pd->mutable_data(place, ng2pd_type_map[ng_type]); + PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()), + "Ensure ngraph tensor layout align with paddle tensor"); + ti = backend_->create_tensor(ng_type, sp, pd_arr); + } else { + PADDLE_THROW("Cannot find var or tensor with var name %s", vi); + } + bool is_persistable = + (p_persistables->find(vi) != p_persistables->end()) ? 
true : false; + if (is_test && is_persistable) { + ti->set_stale(false); + } + (*p_t_in).emplace_back(ti); } - t_in.push_back(ti); } - for (size_t i = 0; i < var_out_.size(); ++i) { - auto vo = var_out_[i]; + std::vector> t_out = {}; + for (size_t i = 0; i < p_var_out->size(); ++i) { + auto vo = p_var_out->at(i); auto* var = scope.FindVar(vo); - std::shared_ptr to; if (var && var->IsType()) { + auto sp = m_results[i]->get_shape(); + var->GetMutable()->Resize(Shape2Ddim(sp)); auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var); - auto dd = tensor_pd->dims(); - ngraph::Shape sp = Ddim2Shape(dd); - auto ng_type = var_type_map_.at(vo); - if (ng_type == ngraph::element::f32) { - auto pd_arr = tensor_pd->mutable_data(place); - to = backend_->create_tensor(ng_type, sp, pd_arr); - } else if (ng_type == ngraph::element::i64) { - auto pd_arr = tensor_pd->mutable_data(place); - to = backend_->create_tensor(ng_type, sp, pd_arr); - } else if (ng_type == ngraph::element::i32) { - auto pd_arr = tensor_pd->mutable_data(place); - to = backend_->create_tensor(ng_type, sp, pd_arr); - } else if (ng_type == ngraph::element::f64) { - auto pd_arr = tensor_pd->mutable_data(place); - to = backend_->create_tensor(ng_type, sp, pd_arr); - } else if (ng_type == ngraph::element::boolean) { - auto pd_arr = tensor_pd->mutable_data(place); - to = backend_->create_tensor(ng_type, sp, pd_arr); - } else { - PADDLE_THROW("Data type not handled in for var %s", vo); - } - t_out.push_back(to); + auto ng_type = m_results[i]->get_element_type(); + void* pd_arr = tensor_pd->mutable_data(place, ng2pd_type_map[ng_type]); + std::shared_ptr to = + backend_->create_tensor(ng_type, sp, pd_arr); + t_out.emplace_back(to); } else { PADDLE_THROW("Cannot find var or tensor with var name %s", vo); } } - auto handle = backend_->compile(ngraph_function_); - handle->call_with_validate(t_out, t_in); + auto handle = backend_->compile(ng_func); + handle->call_with_validate(t_out, *p_t_in); } // NgraphEngine::Run } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/ngraph/ngraph_engine.h b/paddle/fluid/operators/ngraph/ngraph_engine.h index bf5ff2a743b0edb69163e674d36c56a02c0b4153..b6532519e947bc59f0605c4f2008270f5e51b0e0 100644 --- a/paddle/fluid/operators/ngraph/ngraph_engine.h +++ b/paddle/fluid/operators/ngraph/ngraph_engine.h @@ -12,12 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#pragma once + +#include +#include #include #include +#include #include #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/var_desc.h" #include "ngraph/ngraph.hpp" @@ -29,33 +35,50 @@ enum class OpState { /* nGraph support state on ops */ PARTIAL_TRAIN, /* Support partial ops for train */ FULL_TEST, /* Support full list of ops for test */ PARTIAL_TEST, /* Support partial list of ops for test */ - FULL, /* All ops supported from feed to fetch */ UNKNOWN /* Output all for debug purpose */ }; +// cache engine repetitives +struct EngineCache { + std::shared_ptr ngraph_function; + std::set persistables; + std::vector var_in; + std::vector var_out; + std::vector var_in_updates; + bool is_test = true; +}; + // perform graph build through bridge and execute computation class NgraphEngine { public: explicit NgraphEngine(const framework::Scope& scope, const platform::Place& place, - const std::string& serialized_graph, - const std::vector& interval); + const framework::ExecutionContext& ctx); void Run(const framework::Scope& scope, const platform::Place& place) const; - static void EnableNgraph(const framework::ProgramDesc& program); + static const framework::BlockDesc* p_bdesc; + static std::vector feed_vars, fetch_vars; + + static void FuseNgraphOps( + const framework::BlockDesc& prog, + std::vector>* ops); private: - static std::unordered_map> - func_cache_; + static std::unordered_map engine_cache; + static std::unordered_map< + std::string, std::vector>> + t_in_cache_; + static framework::Variable* pre_var_ptr; + const framework::Scope& scope_; const platform::Place& place_; std::vector> fused_ops_; std::unordered_map var_type_map_; - std::unordered_set persistables_; - std::unordered_set fetches_; + std::set persistables_; std::unordered_set post_op_inputs_; - OpState ng_op_state_ = OpState::UNKNOWN; + OpState op_state_ = OpState::UNKNOWN; + bool is_test_{true}; std::string func_cache_key_; // ngraph backend eg. 
CPU @@ -66,6 +89,8 @@ class NgraphEngine { std::vector<std::string> var_in_; // var_name of outputs from fetch in order std::vector<std::string> var_out_; + // non-persistable var_in + std::vector<size_t> var_in_updates_; // map input vars to nodes std::shared_ptr< std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>> @@ -74,19 +99,21 @@ class NgraphEngine { std::shared_ptr< std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>> var_node_map_; - // prepare info for nraph engine - void Prepare(const framework::BlockDesc& block, - const std::vector<int>& interval); + // prepare info needed by the ngraph engine + void Prepare(const std::vector<int>& interval); + // get ngraph engine input and output list + void BuildNgIO(const std::vector<framework::OpDesc*>& op_descs, + const std::vector<int>& interval); // get ngraph input and define ngraph input parameters - void GetNgInputShape(std::shared_ptr<framework::OperatorBase> op); + void GetNgInputShape(); // Call ngraph bridge to map ops void BuildNgNodes(); - // get the ngraph input and output var list - void BuildNgIO(); + // run paddle RuntimeInferShape to get the tensor shape + void RunInferShape(); // build ngraph function call - void BuildNgFunction(); + void BuildNgFunction(const std::vector<int>& interval); // Check cache for ngraph function or otherwise build the function - void GetNgFunction(); + void GetNgFunction(std::string engine_key, const std::vector<int>& interval); }; } // namespace operators diff --git a/paddle/fluid/operators/ngraph/ngraph_engine_op.cc b/paddle/fluid/operators/ngraph/ngraph_engine_op.cc index 3051ca123b29658d3e9a35239ad00f621a297cb5..479c95ba08c316be3d1d983ea736fcc505332d6e 100644 --- a/paddle/fluid/operators/ngraph/ngraph_engine_op.cc +++ b/paddle/fluid/operators/ngraph/ngraph_engine_op.cc @@ -29,6 +29,7 @@ class NgraphEngineOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Xs", "A list of inputs.").AsDispensable(); AddOutput("Ys", "A list of outputs").AsDispensable(); AddAttr<std::string>("graph", "the graph."); + AddAttr<std::string>("engine_key", "the engine hash key."); AddAttr<std::vector<int>>("interval", "op interval supported by ngraph"); AddComment("ngraph engine operator."); } @@ -36,8 +37,7 @@ class NgraphEngineOpMaker : public framework::OpProtoAndCheckerMaker { class NgraphEngineInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override {} + void operator()(framework::InferVarTypeContext *ctx) const override {} }; } // namespace operators diff --git a/paddle/fluid/operators/ngraph/ngraph_engine_op.h b/paddle/fluid/operators/ngraph/ngraph_engine_op.h index 2f194a9b8766316fc645f7e22e21fff048fb7d63..c9b2a3970e17c1a06fa0cc67aa15df304a30656e 100644 --- a/paddle/fluid/operators/ngraph/ngraph_engine_op.h +++ b/paddle/fluid/operators/ngraph/ngraph_engine_op.h @@ -46,10 +46,8 @@ class NgraphEngineKernel : public framework::OpKernel<T> { void Compute(const framework::ExecutionContext& ctx) const override { auto& scope = ctx.scope(); auto place = ctx.GetPlace(); - std::string serialized_graph = ctx.Attr<std::string>("graph"); - auto interval = ctx.Attr<std::vector<int>>("interval"); - NgraphEngine ngraph_engine(scope, place, serialized_graph, interval); + NgraphEngine ngraph_engine(scope, place, ctx); ngraph_engine.Run(scope, place); } }; diff --git a/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h b/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h index be36b9d21ef6ebe5c11d783462e7dc564afe2aba..c92ebb7e96fa22f8fd463c5837134cd74542766c 100644 --- a/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h +++ b/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h @@ -27,13 +27,9 @@ namespace paddle { namespace operators { namespace ngraphs
{ -void BuildCrossEntropyNode( - const std::shared_ptr& op, - std::shared_ptr< - std::unordered_map>> - ngb_node_map) { - auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map); - auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map); +std::shared_ptr GetCrossEntropy( + std::shared_ptr x, std::shared_ptr label, + const bool is_soft_label, int ignore_index) { auto label_shape = label->get_shape(); auto x_shape = x->get_shape(); auto label_rank = label_shape.size(); @@ -46,18 +42,16 @@ void BuildCrossEntropyNode( label_2d = paddle::platform::NgReshaper(label, label_2d_shape); } if (x_rank > 2) { - x_2d_shape = paddle::platform::FlattenTo2d(x_shape, x_rank - 1); - x_2d = paddle::platform::NgReshaper(x, x_2d_shape); + x_2d_shape = platform::FlattenTo2d(x_shape, x_rank - 1); + x_2d = platform::NgReshaper(x, x_2d_shape); } auto batch_size = x_2d_shape.at(0); - auto op_attrs = paddle::framework::AttrReader(op->Attrs()); - const bool is_soft_label = op_attrs.Get("soft_label"); std::shared_ptr node_1_hot = label_2d; if (!is_soft_label) { - auto label_1d = paddle::platform::NgReshaper( - label_2d, ngraph::Shape{label_2d_shape.at(0)}); + auto label_1d = + platform::NgReshaper(label_2d, ngraph::Shape{label_2d_shape.at(0)}); node_1_hot = std::make_shared(label_1d, x_2d_shape, 1); } if (x->get_element_type() != node_1_hot->get_element_type()) { @@ -76,11 +70,9 @@ void BuildCrossEntropyNode( auto node_sum = std::make_shared(node_mul, ngraph::AxisSet{1}); auto node_neg = std::make_shared(node_sum); - auto xe = - paddle::platform::NgReshaper(node_neg, ngraph::Shape{batch_size, 1}); + auto xe = platform::NgReshaper(node_neg, ngraph::Shape{batch_size, 1}); if (!is_soft_label) { - auto ignore_index = op_attrs.Get("ignore_index"); auto ignore_node = ngraph::op::Constant::create( label->get_element_type(), label_2d_shape, {ignore_index}); auto not_equal_node = @@ -89,21 +81,13 @@ void BuildCrossEntropyNode( xe->get_element_type()); xe = xe * mask; } - - paddle::platform::SetOutputNode(op, "Y", xe, ngb_node_map); + return xe; } -void BuildCrossEntropyGradNode( - const std::shared_ptr& op, - std::shared_ptr< - std::unordered_map>> - ngb_node_map) { - auto op_attrs = paddle::framework::AttrReader(op->Attrs()); - const bool is_soft_label = op_attrs.Get("soft_label"); - - auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map); - auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map); - auto dy = paddle::platform::GetInputNode(op, "Y@GRAD", ngb_node_map); +std::shared_ptr GetCrossEntropyGrad( + std::shared_ptr x, std::shared_ptr label, + std::shared_ptr dy, const bool is_soft_label, + int ignore_index) { auto x_shape = x->get_shape(); auto rank = x_shape.size(); @@ -111,9 +95,8 @@ void BuildCrossEntropyGradNode( if (!is_soft_label) { auto label_shape = label->get_shape(); label_shape.pop_back(); - label = paddle::platform::NgReshaper(label, label_shape); + label = platform::NgReshaper(label, label_shape); - auto ignore_index = op_attrs.Get("ignore_index"); auto ignore_node = ngraph::op::Constant::create( label->get_element_type(), label_shape, {ignore_index}); auto not_equal_node = @@ -128,7 +111,7 @@ void BuildCrossEntropyGradNode( auto dy_shape = dy->get_shape(); dy_shape.pop_back(); - auto dy_reshape = paddle::platform::NgReshaper(dy, dy_shape); + auto dy_reshape = platform::NgReshaper(dy, dy_shape); auto dy_bcast = std::make_shared( dy_reshape, x_shape, ngraph::AxisSet{rank - 1}); if (x->get_element_type() != label->get_element_type()) { @@ -140,7 +123,35 @@ void 
BuildCrossEntropyGradNode( if (!is_soft_label) { xe_grad = xe_grad * mask; } + return xe_grad; +} +void BuildCrossEntropyNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map); + auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map); + auto op_attrs = paddle::framework::AttrReader(op->Attrs()); + const bool is_soft_label = op_attrs.Get("soft_label"); + int ignore_index = op_attrs.Get("ignore_index"); + auto xe = GetCrossEntropy(x, label, is_soft_label, ignore_index); + paddle::platform::SetOutputNode(op, "Y", xe, ngb_node_map); +} + +void BuildCrossEntropyGradNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto op_attrs = paddle::framework::AttrReader(op->Attrs()); + const bool is_soft_label = op_attrs.Get("soft_label"); + int ignore_index = op_attrs.Get("ignore_index"); + auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map); + auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map); + auto dy = paddle::platform::GetInputNode(op, "Y@GRAD", ngb_node_map); + auto xe_grad = GetCrossEntropyGrad(x, label, dy, is_soft_label, ignore_index); paddle::platform::SetOutputNode(op, "X@GRAD", xe_grad, ngb_node_map); } } // namespace ngraphs diff --git a/paddle/fluid/operators/ngraph/ops/softmax_op.h b/paddle/fluid/operators/ngraph/ops/softmax_op.h index 7d5720c460c4194ce06670a715b8d7ff4435bb2a..174b7a91a8dd0e3edb06f224c3914e24c6c4a96d 100644 --- a/paddle/fluid/operators/ngraph/ops/softmax_op.h +++ b/paddle/fluid/operators/ngraph/ops/softmax_op.h @@ -27,12 +27,7 @@ namespace paddle { namespace operators { namespace ngraphs { -void BuildSoftmaxNode( - const std::shared_ptr& op, - std::shared_ptr< - std::unordered_map>> - ngb_node_map) { - auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map); +std::shared_ptr GetSoftmax(std::shared_ptr x) { auto x_shape = x->get_shape(); int rank = x_shape.size(); auto x_2d_shape = paddle::platform::FlattenTo2d(x_shape, rank - 1); @@ -47,16 +42,11 @@ void BuildSoftmaxNode( -64., x_shifted); auto softmax = std::make_shared(x_clipped, ngraph::AxisSet{1}); - paddle::platform::SetOutputNode(op, "Out", softmax, ngb_node_map); + return softmax; } -void BuildSoftmaxGradNode( - const std::shared_ptr& op, - std::shared_ptr< - std::unordered_map>> - ngb_node_map) { - auto out = paddle::platform::GetInputNode(op, "Out", ngb_node_map); - auto dout = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map); +std::shared_ptr GetSoftmaxGrad( + std::shared_ptr out, std::shared_ptr dout) { auto out_shape = out->get_shape(); int rank = out_shape.size(); auto out_2d_shape = paddle::platform::FlattenTo2d(out_shape, rank - 1); @@ -70,6 +60,27 @@ void BuildSoftmaxGradNode( auto node_bcast = std::make_shared( node_sum, out_2d_shape, ngraph::AxisSet{1}); auto dx = (dout - node_bcast) * out; + return dx; +} + +void BuildSoftmaxNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map); + auto softmax = GetSoftmax(x); + paddle::platform::SetOutputNode(op, "Out", softmax, ngb_node_map); +} + +void BuildSoftmaxGradNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto out = paddle::platform::GetInputNode(op, "Out", ngb_node_map); + auto dout = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map); + auto dx = GetSoftmaxGrad(out, dout); 
paddle::platform::SetOutputNode(op, "X@GRAD", dx, ngb_node_map); } } // namespace ngraphs diff --git a/paddle/fluid/operators/ngraph/ops/softmax_with_cross_entropy_op.h b/paddle/fluid/operators/ngraph/ops/softmax_with_cross_entropy_op.h new file mode 100644 index 0000000000000000000000000000000000000000..a6bdf4de9522e08caf4a9ae606db8277f98cdab3 --- /dev/null +++ b/paddle/fluid/operators/ngraph/ops/softmax_with_cross_entropy_op.h @@ -0,0 +1,90 @@ +/*Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include "ngraph/ngraph.hpp" +#include "paddle/fluid/operators/ngraph/ops/cross_entropy_op.h" +#include "paddle/fluid/operators/ngraph/ops/softmax_op.h" +#include "paddle/fluid/platform/ngraph_helper.h" + +namespace paddle { +namespace operators { +namespace ngraphs { + +void BuildSoftmaxWithCrossEntropyNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto logits = paddle::platform::GetInputNode(op, "Logits", ngb_node_map); + auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map); + auto softmax = paddle::operators::ngraphs::GetSoftmax(logits); + + auto op_attrs = framework::AttrReader(op->Attrs()); + const bool is_soft_label = op_attrs.Get("soft_label"); + int ignore_index = op_attrs.Get("ignore_index"); + auto xe = paddle::operators::ngraphs::GetCrossEntropy( + softmax, label, is_soft_label, ignore_index); + + paddle::platform::SetOutputNode(op, "Softmax", softmax, ngb_node_map); + paddle::platform::SetOutputNode(op, "Loss", xe, ngb_node_map); +} + +void BuildSoftmaxWithCrossEntropyGradNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto op_attrs = framework::AttrReader(op->Attrs()); + const bool is_soft_label = op_attrs.Get("soft_label"); + auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map); + auto softmax = paddle::platform::GetInputNode(op, "Softmax", ngb_node_map); + auto loss_grad = + paddle::platform::GetInputNode(op, "Loss@GRAD", ngb_node_map); + auto softmax_shape = softmax->get_shape(); + auto rank = softmax_shape.size(); + if (!is_soft_label) { + auto label_shape = label->get_shape(); + label_shape.pop_back(); + label = platform::NgReshaper(label, label_shape); + + label = + std::make_shared(label, softmax_shape, rank - 1); + } + + auto loss_grad_shape = loss_grad->get_shape(); + loss_grad_shape.pop_back(); + auto loss_grad_reshape = platform::NgReshaper(loss_grad, loss_grad_shape); + auto loss_grad_bcast = std::make_shared( + loss_grad_reshape, softmax_shape, ngraph::AxisSet{rank - 1}); + if (softmax->get_element_type() != label->get_element_type()) { + label = std::make_shared(label, + softmax->get_element_type()); + } + + auto logits_grad = loss_grad_bcast * (softmax - label); + paddle::platform::SetOutputNode(op, "Logits@GRAD", logits_grad, ngb_node_map); +} +} // namespace ngraphs +} // namespace operators +} // namespace paddle + 
+REGISTER_NG_OP(softmax_with_cross_entropy, BuildSoftmaxWithCrossEntropyNode); +REGISTER_NG_OP(softmax_with_cross_entropy_grad, + BuildSoftmaxWithCrossEntropyGradNode); diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 09255f60e6953734680cc9b008504fabc5589cf0..6262ef0c2d3802bca574ba1312e7cf4a720403ef 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include // for sqrt in CPU and CUDA #include +#include #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/threadpool.h" @@ -311,17 +312,17 @@ struct SparseAdamFunctor { T beta1_pow = *beta1_pow_; T beta2_pow = *beta2_pow_; lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow); - size_t row_count = numel / row_numel_; + int64_t row_count = static_cast(numel / row_numel_); - for (size_t i = 0U, j = 0U; i != row_count; ++i) { + for (int64_t i = 0, j = 0; i != row_count; ++i) { if (i == *(rows_ + j)) { - for (size_t k = 0U; k != row_numel_; ++k) { + for (int64_t k = 0; k != row_numel_; ++k) { T g = grad_[j * row_numel_ + k]; adam_update(i * row_numel_ + k, g); } ++j; } else { - for (size_t k = 0U; k != row_numel_; ++k) { + for (int64_t k = 0; k != row_numel_; ++k) { T mom1 = moment1_[i * row_numel_ + k]; T mom2 = moment2_[i * row_numel_ + k]; T p = param_[i * row_numel_ + k]; @@ -427,43 +428,23 @@ class AdamOpKernel : public framework::OpKernel { } } - framework::SelectedRows cpu_grad_merge; + framework::SelectedRows tmp_grad_merge; const framework::SelectedRows* grad_merge_ptr; if (is_strict_sorted) { grad_merge_ptr = &grad; } else { // merge duplicated rows if any. // The rows of grad_merge have been sorted inside MergeAdd functor - framework::SelectedRows* grad_merge_var; scatter::MergeAdd merge_func; - if (platform::is_cpu_place(ctx.GetPlace())) { - grad_merge_var = &cpu_grad_merge; - } else { - // FIXME(qiao): GPU also need to fix this - grad_merge_var = const_cast(ctx.scope()) - .Var() - ->GetMutable(); - } merge_func(ctx.template device_context(), grad, - grad_merge_var, true); - grad_merge_ptr = grad_merge_var; + &tmp_grad_merge, true); + grad_merge_ptr = &tmp_grad_merge; } auto& grad_merge = *grad_merge_ptr; auto& grad_tensor = grad_merge.value(); const T* grad_data = grad_tensor.template data(); - const int64_t* rows = nullptr; -// When compiled without CUDA, the CUDAData() interface should not be -// provided. 
-#if defined(PADDLE_WITH_CUDA) - if (platform::is_gpu_place(ctx.GetPlace())) { - rows = grad_merge.rows().CUDAData(ctx.GetPlace()); - } else { -#endif - rows = grad_merge.rows().data(); -#if defined(PADDLE_WITH_CUDA) - } -#endif + const int64_t* rows = grad_merge.rows().Data(ctx.GetPlace()); auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); if (platform::is_cpu_place(ctx.GetPlace())) { @@ -488,7 +469,7 @@ class AdamOpKernel : public framework::OpKernel { } } #ifndef _WIN32 - else if (FLAGS_inner_op_parallelism > 1 && + else if (FLAGS_inner_op_parallelism > 1 && // NOLINT min_row_size_to_use_multithread > 0 && param.dims()[0] > min_row_size_to_use_multithread) { VLOG(3) << "use multi thread, inner_op_parallelism=" @@ -516,11 +497,11 @@ class AdamOpKernel : public framework::OpKernel { for (int i = 0; i < FLAGS_inner_op_parallelism; ++i) { int64_t start = i * line_in_each_thread; int64_t end = (i + 1) * line_in_each_thread; - if (start >= param_row_count) { + if (start >= static_cast(param_row_count)) { break; } - if (end > param_row_count) { - end = param_row_count; + if (end > static_cast(param_row_count)) { + end = static_cast(param_row_count); } fs.push_back( framework::Async([&functor, &row_id_to_grad_row_offset, @@ -545,8 +526,8 @@ class AdamOpKernel : public framework::OpKernel { } for (size_t i = 0; i < fs.size(); ++i) fs[i].wait(); } -#endif // !_WIN32 - else { +#endif // !_WIN32 + else { // NOLINT functor(param.numel()); } } else if (platform::is_gpu_place(ctx.GetPlace())) { diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cc b/paddle/fluid/operators/optimizers/lars_momentum_op.cc index 574a03680b66962ac2d6ba249d0fc491a36794cd..126b665dd4d9301ae67346afa45a250accfec656 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cc +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cc @@ -56,9 +56,9 @@ This optimizer use LARS (https://arxiv.org/abs/1708.03888) to optimize each weight using a local learning rate: $$ -local\_lr = \eta * +local\_lr = \eta * \frac{\left \| param \right \|}{\left \| grad \right \| + \beta *\left \| param \right \|} \\ -velocity = mu * velocity + +velocity = mu * velocity + local\_lr * (grad + \beta * param) \\ param = param - velocity. \\ $$ @@ -72,8 +72,7 @@ use L2 regularizers in case of using LARS. 
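As a reading aid for the LARS formula quoted above, here is a minimal scalar sketch of one LARS-momentum step in plain C++. It is illustrative only: the function name and signature are invented for this note and are not part of the operator's API.

#include <cmath>
#include <cstddef>
#include <vector>

// One LARS step for a single weight tensor, following the doc comment above:
//   local_lr = eta * ||param|| / (||grad|| + beta * ||param||)
//   velocity = mu * velocity + local_lr * (grad + beta * param)
//   param    = param - velocity
void LarsMomentumStep(std::vector<float>* param, std::vector<float>* velocity,
                      const std::vector<float>& grad, float eta, float mu,
                      float beta) {
  float p_norm = 0.f, g_norm = 0.f;
  for (std::size_t i = 0; i < param->size(); ++i) {
    p_norm += (*param)[i] * (*param)[i];
    g_norm += grad[i] * grad[i];
  }
  p_norm = std::sqrt(p_norm);
  g_norm = std::sqrt(g_norm);
  const float local_lr = eta * p_norm / (g_norm + beta * p_norm);
  for (std::size_t i = 0; i < param->size(); ++i) {
    (*velocity)[i] =
        mu * (*velocity)[i] + local_lr * (grad[i] + beta * (*param)[i]);
    (*param)[i] -= (*velocity)[i];
  }
}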
class LarsMomentumOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override {} + void operator()(framework::InferVarTypeContext* ctx) const override {} }; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/optimizers/momentum_op.cc b/paddle/fluid/operators/optimizers/momentum_op.cc index cde238c076b6991eb52dac328c3e30a045420c92..7cf218c20f4c8a22aefc8cd8ce8e1cca36dee3bf 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.cc +++ b/paddle/fluid/operators/optimizers/momentum_op.cc @@ -21,18 +21,14 @@ using Tensor = framework::Tensor; class MomentumOpInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc& op_desc, - framework::BlockDesc* block) const override { - auto input_var = op_desc.Input("Param")[0]; - for (auto& out_var : op_desc.Output("ParamOut")) { - if (block->FindRecursiveOrCreateVar(input_var).GetType() == - framework::proto::VarType::SELECTED_ROWS) { - block->FindRecursiveOrCreateVar(out_var).SetType( - framework::proto::VarType::SELECTED_ROWS); - } else if (block->FindRecursiveOrCreateVar(input_var).GetType() == + void operator()(framework::InferVarTypeContext* ctx) const override { + auto& input_var = ctx->Input("Param")[0]; + for (auto& out_var : ctx->Output("ParamOut")) { + if (ctx->GetType(input_var) == framework::proto::VarType::SELECTED_ROWS) { + ctx->SetType(out_var, framework::proto::VarType::SELECTED_ROWS); + } else if (ctx->GetType(input_var) == framework::proto::VarType::LOD_TENSOR) { - block->FindRecursiveOrCreateVar(out_var).SetType( - framework::proto::VarType::LOD_TENSOR); + ctx->SetType(out_var, framework::proto::VarType::LOD_TENSOR); } else { PADDLE_THROW( "Only support LodTensor and SelectedRows, Unexpected Input Type."); diff --git a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h index 3ed1bff5ff4993e9c858dea8d56a8cb6124aca89..29a2ae6755aa609e4a6ee43bbf11fe02ebfa654e 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.h +++ b/paddle/fluid/operators/optimizers/momentum_op.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" @@ -69,6 +70,7 @@ class MomentumOp : public framework::OperatorWithKernel { ctx->SetOutputDim("ParamOut", param_dim); ctx->SetOutputDim("VelocityOut", param_dim); } + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { auto input_data_type = framework::GetDataTypeOfVar(ctx.InputVar("Param")); @@ -351,23 +353,14 @@ class MomentumOpKernel : public framework::OpKernel { VLOG(3) << "Grad SelectedRows contains no data!"; return; } - auto* merged_grad = const_cast(ctx.scope()) - .Var() - ->GetMutable(); + + framework::SelectedRows tmp_merged_grad; + framework::SelectedRows* merged_grad = &tmp_merged_grad; math::scatter::MergeAdd merge_func; merge_func(ctx.template device_context(), *grad, merged_grad); - const int64_t* rows = nullptr; -#ifdef PADDLE_WITH_CUDA - if (platform::is_gpu_place(ctx.GetPlace())) { - rows = merged_grad->rows().CUDAData(ctx.GetPlace()); - } else { -#endif - rows = merged_grad->rows().data(); -#ifdef PADDLE_WITH_CUDA - } -#endif + const int64_t* rows = merged_grad->rows().Data(ctx.GetPlace()); int64_t row_numel = merged_grad->value().numel() / merged_grad->rows().size(); platform::ForRange for_range( diff --git a/paddle/fluid/operators/optimizers/rmsprop_op.h b/paddle/fluid/operators/optimizers/rmsprop_op.h index 389c84d2464090ff9bd9e8b471cd0103c86a347a..4550052b2d614ccbbb09f4a2b9e747708b2a2baa 100644 --- a/paddle/fluid/operators/optimizers/rmsprop_op.h +++ b/paddle/fluid/operators/optimizers/rmsprop_op.h @@ -216,24 +216,14 @@ class RmspropOpKernel : public framework::OpKernel { } } else if (grad_var->IsType()) { auto &grad = grad_var->Get(); - auto *merged_grad = const_cast(ctx.scope()) - .Var() - ->GetMutable(); - + framework::SelectedRows tmp_merged_grad; + framework::SelectedRows *merged_grad = &tmp_merged_grad; math::scatter::MergeAdd merge_func; merge_func(dev_ctx, grad, merged_grad); platform::ForRange for_range(dev_ctx, limit); - const int64_t *rows; -#ifdef PADDLE_WITH_CUDA - if (platform::is_gpu_place(ctx.GetPlace())) { - rows = merged_grad->rows().CUDAData(ctx.GetPlace()); - } else { -#endif - rows = merged_grad->rows().data(); -#ifdef PADDLE_WITH_CUDA - } -#endif + const int64_t *rows = merged_grad->rows().Data(ctx.GetPlace()); + auto &merged_tensor = merged_grad->value(); int64_t row_count = merged_grad->rows().size(); int64_t row_numel = merged_tensor.numel() / row_count; diff --git a/paddle/fluid/operators/optimizers/sgd_op.cc b/paddle/fluid/operators/optimizers/sgd_op.cc index 690381a67f89d18fe81c3b856b7ddce25d496ed0..34e99a14ff77cf8aa7d7f58529140f21d864b596 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cc +++ b/paddle/fluid/operators/optimizers/sgd_op.cc @@ -50,20 +50,18 @@ class SGDOp : public framework::OperatorWithKernel { class SGDOpInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { - auto input_var_n = op_desc.Input("Param")[0]; - auto in_var_type = block->FindRecursiveOrCreateVar(input_var_n).GetType(); + void operator()(framework::InferVarTypeContext *ctx) const override { + auto &input_var_n = ctx->Input("Param")[0]; + auto in_var_type = ctx->GetType(input_var_n); PADDLE_ENFORCE(in_var_type == framework::proto::VarType::SELECTED_ROWS || in_var_type == framework::proto::VarType::LOD_TENSOR, "The input Var's type should be LoDtensor or SelectedRows," " but the 
received var(%s)'s type is %s", input_var_n, in_var_type); - for (auto &out_var_n : op_desc.Output("ParamOut")) { - auto &out_var = block->FindRecursiveOrCreateVar(out_var_n); - if (out_var.GetType() != in_var_type) { - out_var.SetType(in_var_type); + for (auto &out_var_n : ctx->Output("ParamOut")) { + if (ctx->GetType(out_var_n) != in_var_type) { + ctx->SetType(out_var_n, in_var_type); } } } diff --git a/paddle/fluid/operators/optimizers/sgd_op.h b/paddle/fluid/operators/optimizers/sgd_op.h index c9c9f530fe846c1713ad176e05a377996d04470b..5dd5f67e004c63e294152239ab7bd3db26542eed 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.h +++ b/paddle/fluid/operators/optimizers/sgd_op.h @@ -48,7 +48,8 @@ class SGDOpKernel : public framework::OpKernel { T *out_data = param_out->mutable_data(ctx.GetPlace()); auto sgd = - jit::Get, platform::CPUPlace>(attr); + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + attr); sgd(lr, param_data, grad_data, &rows_idx, out_data, &attr); } else if (grad_var->IsType()) { // TODO(qijun): In Sparse SGD operator, in-place update is enforced. @@ -82,7 +83,8 @@ class SGDOpKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(attr.grad_width, attr.param_width); auto sgd = - jit::Get, platform::CPUPlace>(attr); + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + attr); sgd(lr, param_data, grad_data, rows_data, out_data, &attr); } else { PADDLE_THROW("Unsupported Variable Type of Grad"); diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index 0a0ece162cc63696974383d8ed49fdd10204c331..7963c27a0153105b9ab21c7165b5e4daad8346ea 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/pool_op.h" +#include #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cudnn_helper.h" #endif @@ -212,6 +213,12 @@ void Pool2dOpMaker::Make() { AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); + AddAttr("use_quantizer", + "(bool, default false) " + "Set to true for operators that should be quantized and use " + "int8 kernel. 
" + "Only used on CPU.") + .SetDefault(false); AddAttr( "data_format", "(string, default NCHW) Only used in " diff --git a/paddle/fluid/operators/py_func_op.cc b/paddle/fluid/operators/py_func_op.cc index 53eff2de3e3864b0f3d61f95ab5758b65f9eecb5..5300e807472d3bb243dc198c0bfd1bc572538015 100644 --- a/paddle/fluid/operators/py_func_op.cc +++ b/paddle/fluid/operators/py_func_op.cc @@ -14,8 +14,11 @@ #include "paddle/fluid/operators/py_func_op.h" +#include #include #include +#include +#include #include #include "paddle/fluid/framework/op_registry.h" @@ -91,15 +94,12 @@ static void CallPythonFunc(py::object *callable, } } -class PyFuncOpVarTypInference : public framework::VarTypeInference { +class PyFuncOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op, - framework::BlockDesc *block) const override { - auto &outs = op.Outputs(); - bool has_out = (outs.count("Out") > 0 && !outs.at("Out").empty()); + void operator()(framework::InferVarTypeContext *ctx) const override { + bool has_out = (ctx->HasOutput("Out") && !ctx->Output("Out").empty()); - auto &ins = op.Inputs(); - bool has_in = (ins.count("X") > 0 && !ins.at("X").empty()); + bool has_in = (ctx->HasInput("X") && !ctx->Input("X").empty()); /** * X or Out can be empty, so that py_func can be more flexible @@ -107,8 +107,8 @@ class PyFuncOpVarTypInference : public framework::VarTypeInference { */ PADDLE_ENFORCE(has_in || has_out, "Input(X) or Output(Out) must exist"); - PADDLE_ENFORCE_GE(boost::get(op.GetAttr(kForwardPythonCallableId)), 0, - "Function id cannot be less than 0"); + PADDLE_ENFORCE_GE(boost::get(ctx->GetAttr(kForwardPythonCallableId)), + 0, "Function id cannot be less than 0"); if (!has_out) return; @@ -118,7 +118,7 @@ class PyFuncOpVarTypInference : public framework::VarTypeInference { * the corresponding forward variable */ const std::string kGradVarSuffix = framework::kGradVarSuffix; - auto &out_var_names = outs.at("Out"); + auto &out_var_names = ctx->Output("Out"); for (auto &out_var_name : out_var_names) { if (out_var_name == framework::kEmptyVarName || out_var_name.size() < kGradVarSuffix.size()) { @@ -128,18 +128,17 @@ class PyFuncOpVarTypInference : public framework::VarTypeInference { size_t len = out_var_name.size() - kGradVarSuffix.size(); if (out_var_name.substr(len) == kGradVarSuffix) { auto fwd_var_name = out_var_name.substr(0, len); - auto *out_var_desc = block->FindVarRecursive(out_var_name); - auto *fwd_var_desc = block->FindVarRecursive(fwd_var_name); - PADDLE_ENFORCE_NOT_NULL(out_var_desc, "Backward variable %s not found", - out_var_name); - PADDLE_ENFORCE_NOT_NULL(fwd_var_desc, "Forward variable %s not found", - fwd_var_name); + PADDLE_ENFORCE(ctx->HasVar(out_var_name), + "Backward variable %s not found", out_var_name); + PADDLE_ENFORCE(ctx->HasVar(fwd_var_name), + "Backward variable %s not found", fwd_var_name); VLOG(10) << "Infer var_desc of Output(" << out_var_name << ") as Input(" << fwd_var_name << ")"; - out_var_desc->SetShape(fwd_var_desc->GetShape()); - out_var_desc->SetDataType(fwd_var_desc->GetDataType()); - out_var_desc->SetLoDLevel(fwd_var_desc->GetLoDLevel()); - out_var_desc->SetType(fwd_var_desc->GetType()); + + ctx->SetShape(out_var_name, ctx->GetShape(fwd_var_name)); + ctx->SetDataType(out_var_name, ctx->GetDataType(fwd_var_name)); + ctx->SetLoDLevel(out_var_name, ctx->GetLoDLevel(fwd_var_name)); + ctx->SetType(out_var_name, ctx->GetType(fwd_var_name)); } } } @@ -309,5 +308,5 @@ class PyFuncOp : public framework::OperatorBase { 
namespace ops = paddle::operators; REGISTER_OPERATOR(py_func, ops::PyFuncOp, ops::PyFuncOpMaker, - ops::PyFuncOpVarTypInference, ops::PyFuncOpShapeInference, + ops::PyFuncOpVarTypeInference, ops::PyFuncOpShapeInference, ops::PyFuncOpGradDescMaker); diff --git a/paddle/fluid/operators/range_op.cc b/paddle/fluid/operators/range_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ee8c68fd008c8c9764e9ef74dc37fa08cf31be19 --- /dev/null +++ b/paddle/fluid/operators/range_op.cc @@ -0,0 +1,69 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/range_op.h" + +namespace paddle { +namespace operators { + +class RangeOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + if (ctx->HasInput("Start")) { + auto s_dims = ctx->GetInputDim("Start"); + PADDLE_ENFORCE((s_dims.size() == 1) && (s_dims[0] == 1), + "The shape of Input(Start) should be [1]."); + } + if (ctx->HasInput("End")) { + auto e_dims = ctx->GetInputDim("End"); + PADDLE_ENFORCE((e_dims.size() == 1) && (e_dims[0] == 1), + "The shape of Input(End) should be [1]."); + } + if (ctx->HasInput("Step")) { + auto step_dims = ctx->GetInputDim("Step"); + PADDLE_ENFORCE((step_dims.size() == 1) && (step_dims[0] == 1), + "The shape of Input(Step) should be [1]."); + } + ctx->SetOutputDim("Out", {-1}); + } +}; + +class RangeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Start", + "Start of interval. The interval includes this value. It is a " + "tensor with shape=[1]."); + AddInput("End", + "End of interval. The interval does not include this value, " + "except in some cases where step is not an integer and floating " + "point round-off affects the length of out. It is a tensor with " + "shape=[1]."); + AddInput("Step", "Spacing between values. It is a tensor with shape=[1]."); + AddOutput("Out", "A sequence of numbers."); + AddComment(R"DOC( + Return evenly spaced values within a given interval. Values are generated within the half-open interval [start, stop) (in other words, the interval including start but excluding stop). Like arange function of numpy. +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(range, ops::RangeOp, ops::RangeOpMaker); +REGISTER_OP_CPU_KERNEL(range, ops::CPURangeKernel, + ops::CPURangeKernel, ops::CPURangeKernel, + ops::CPURangeKernel); diff --git a/paddle/fluid/operators/range_op.cu b/paddle/fluid/operators/range_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..e2c03716d55ee41ce3a9053b48b5c6d4c70e391f --- /dev/null +++ b/paddle/fluid/operators/range_op.cu @@ -0,0 +1,67 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
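
Note that RangeOp::InferShape above pins Out to {-1}: the element count depends on tensor values that are only known at run time, so both the CPU and CUDA kernels recover it with the shared GetSize helper defined in range_op.h further below. A minimal sketch of that computation (function name illustrative; the integral case in range_op.h uses (|end-start| + |step| - 1) / |step| instead of std::ceil):

    #include <cmath>
    #include <cstdint>

    // e.g. start=0, end=10, step=3 -> ceil(10/3) = 4 elements: 0, 3, 6, 9
    int64_t RangeSize(double start, double end, double step) {
      return static_cast<int64_t>(std::ceil(std::abs((end - start) / step)));
    }
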
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/range_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" + +namespace paddle { +namespace operators { + +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +template +__global__ void RangeKernel(T start, T step, int64_t size, T* out) { + CUDA_1D_KERNEL_LOOP(index, size) { out[index] = start + step * index; } +} + +template +class CUDARangeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* start_t = context.Input("Start"); + auto* end_t = context.Input("End"); + auto* step_t = context.Input("Step"); + auto* out = context.Output("Out"); + + framework::Tensor n; + framework::TensorCopy(*start_t, platform::CPUPlace(), &n); + T start = n.data()[0]; + framework::TensorCopy(*end_t, platform::CPUPlace(), &n); + T end = n.data()[0]; + framework::TensorCopy(*step_t, platform::CPUPlace(), &n); + T step = n.data()[0]; + + int64_t size = 0; + GetSize(start, end, step, &size); + out->Resize(framework::make_ddim({size})); + T* out_data = out->mutable_data(context.GetPlace()); + + auto stream = context.cuda_device_context().stream(); + int block = 512; + int grid = (size + block - 1) / block; + RangeKernel<<>>(start, step, size, out_data); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(range, ops::CUDARangeKernel, + ops::CUDARangeKernel, + ops::CUDARangeKernel, + ops::CUDARangeKernel); diff --git a/paddle/fluid/operators/range_op.h b/paddle/fluid/operators/range_op.h new file mode 100644 index 0000000000000000000000000000000000000000..fce58b45c96ad76dfdd4ed7f54becde327070002 --- /dev/null +++ b/paddle/fluid/operators/range_op.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +void GetSize(T start, T end, T step, int64_t* size) { + PADDLE_ENFORCE(!std::equal_to()(step, 0), + "The step of range op should not be 0."); + PADDLE_ENFORCE(((start < end) && (step > 0)) || ((start > end) && (step < 0)), + "The step should be greater than 0 while start < end. 
And the " + "step should be less than 0 while start > end."); + *size = std::is_integral::value + ? ((std::abs(end - start) + std::abs(step) - 1) / std::abs(step)) + : std::ceil(std::abs((end - start) / step)); +} + +template +class CPURangeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + T start = context.Input("Start")->data()[0]; + T end = context.Input("End")->data()[0]; + T step = context.Input("Step")->data()[0]; + auto* out = context.Output("Out"); + int64_t size = 0; + GetSize(start, end, step, &size); + out->Resize(framework::make_ddim({size})); + T* out_data = out->mutable_data(context.GetPlace()); + T value = start; + for (int64_t i = 0; i < size; ++i) { + out_data[i] = value; + value += step; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reader/CMakeLists.txt b/paddle/fluid/operators/reader/CMakeLists.txt index 7c284312df912ad758f6fffc44f111dfe765feb8..5ee1206175600cd668ccbbf5b98053708a4406d3 100644 --- a/paddle/fluid/operators/reader/CMakeLists.txt +++ b/paddle/fluid/operators/reader/CMakeLists.txt @@ -17,7 +17,9 @@ function(reader_library TARGET_NAME) PARENT_SCOPE) endfunction() +cc_library(py_reader SRCS py_reader.cc DEPS reader) cc_library(buffered_reader SRCS buffered_reader.cc DEPS reader simple_threadpool) + reader_library(open_files_op SRCS open_files_op.cc DEPS buffered_reader) reader_library(create_random_data_generator_op SRCS create_random_data_generator_op.cc) reader_library(create_shuffle_reader_op SRCS create_shuffle_reader_op.cc) @@ -26,7 +28,7 @@ reader_library(create_recordio_file_reader_op SRCS create_recordio_file_reader_o reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc DEPS buffered_reader) reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc) reader_library(create_custom_reader_op SRCS create_custom_reader_op.cc) -reader_library(create_py_reader_op SRCS create_py_reader_op.cc) +reader_library(create_py_reader_op SRCS create_py_reader_op.cc DEPS py_reader) if (NOT WIN32 AND NOT ON_INFER) cc_library(ctr_reader SRCS ctr_reader.cc DEPS gzstream reader zlib) @@ -38,7 +40,7 @@ cc_test(reader_blocking_queue_test SRCS reader_blocking_queue_test.cc) # Export local libraries to parent # set(READER_LIBRARY ${LOCAL_READER_LIBS} PARENT_SCOPE) -op_library(read_op) +op_library(read_op DEPS py_reader buffered_reader) foreach(src ${LOCAL_READER_LIBS}) set(OP_LIBRARY ${src} ${OP_LIBRARY} CACHE INTERNAL "op libs") diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h index 51b980acb5a08d431d96a3a92479dec09119c27e..78d238aa6115265023d5d87c01048a87180448d0 100644 --- a/paddle/fluid/operators/reader/blocking_queue.h +++ b/paddle/fluid/operators/reader/blocking_queue.h @@ -16,6 +16,7 @@ #include // NOLINT #include +#include #include "paddle/fluid/platform/enforce.h" @@ -34,7 +35,7 @@ class BlockingQueue { explicit BlockingQueue(size_t capacity, bool speed_test_mode = false) : capacity_(capacity), speed_test_mode_(speed_test_mode), closed_(false) { PADDLE_ENFORCE_GT( - capacity_, 0, + capacity_, static_cast(0), "The capacity of a reader::BlockingQueue must be greater than 0."); } diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 52e96c4fb3a058057f5acd5e30b7a0e869aefacc..c24e9aedc4ebd8f4fa9e483b1c1cc71fe0bf0aa7 100644 --- 
a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -17,6 +17,7 @@ #include #include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { namespace reader { @@ -29,8 +30,10 @@ BufferedReader::~BufferedReader() { #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place_)) { platform::SetDeviceId(boost::get(place_).device); - PADDLE_ENFORCE(cudaStreamDestroy(stream)); - for (auto &event : events) PADDLE_ENFORCE(cudaEventDestroy(event)); + PADDLE_ENFORCE(cudaStreamDestroy(stream_)); + for (auto &event : events_) { + PADDLE_ENFORCE(cudaEventDestroy(event)); + } } #endif } @@ -45,14 +48,15 @@ BufferedReader::BufferedReader( #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place_)) { platform::SetDeviceId(boost::get(place_).device); - compute_stream = + compute_stream_ = ((platform::CUDADeviceContext *)(platform::DeviceContextPool::Instance() .Get(place_))) ->stream(); - events.resize(buffer_size); - for (auto &event : events) + events_.resize(buffer_size); + for (auto &event : events_) { PADDLE_ENFORCE(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); - PADDLE_ENFORCE(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + } + PADDLE_ENFORCE(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)); } #endif cpu_buffer_.resize(buffer_size); @@ -71,7 +75,7 @@ void BufferedReader::ReadAsync(size_t i) { #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place_)) { platform::SetDeviceId(boost::get(place_).device); - PADDLE_ENFORCE(cudaEventRecord(events[i], compute_stream)); + PADDLE_ENFORCE(cudaEventRecord(events_[i], compute_stream_)); } #endif position_.emplace(thread_pool_.enqueue([this, i]() -> size_t { @@ -84,12 +88,15 @@ void BufferedReader::ReadAsync(size_t i) { #ifdef PADDLE_WITH_CUDA // NOTE(liangdun): using async copy instead of TensorCopySync - // TensorCopySync would block other stream + // TensorCopySync would block other stream, because TensorCopySync + // issues the copying command to the default stream, it will make two + // commands from different streams cannot run concurrently. if (platform::is_gpu_place(place_)) { platform::SetDeviceId(boost::get(place_).device); - PADDLE_ENFORCE(cudaStreamWaitEvent(stream, events[i], 0)); + PADDLE_ENFORCE(cudaStreamWaitEvent(stream_, events_[i], 0)); TensorVec &gpu = gpu_buffer_[i]; gpu.resize(cpu.size()); + platform::RecordEvent record_event("BufferedReader:MemoryCopy"); for (size_t i = 0; i < cpu.size(); ++i) { gpu[i].Resize(cpu[i].dims()); gpu[i].set_layout(cpu[i].layout()); @@ -98,23 +105,25 @@ void BufferedReader::ReadAsync(size_t i) { auto gpu_ptr = gpu[i].mutable_data(place_, cpu[i].type()); auto size = cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type()); - if (platform::is_cuda_pinned_place(cpu_place)) + if (platform::is_cuda_pinned_place(cpu_place)) { memory::Copy(boost::get(place_), gpu_ptr, boost::get(cpu_place), - cpu_ptr, size, stream); - else if ((platform::is_gpu_place(cpu_place))) + cpu_ptr, size, stream_); + } else if ((platform::is_gpu_place(cpu_place))) { memory::Copy(boost::get(place_), gpu_ptr, boost::get(cpu_place), cpu_ptr, - size, stream); - else + size, stream_); + } else { // if cpu place is not pinned, async copy is slower than sync copy, // so we use sync copy instead. + // TODO(zcd): The default stream should not be used here. 
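
For orientation, the stream handshake this async path implements, sketched with raw CUDA runtime calls (the real code goes through memory::Copy and wraps each call in PADDLE_ENFORCE):

    // 1. ReadAsync marks the tail of prior compute work on the compute stream.
    cudaEventRecord(events_[i], compute_stream_);
    // 2. The copy stream waits for that event, so the H2D copies overlap with
    //    unrelated compute instead of serializing on the default stream.
    cudaStreamWaitEvent(stream_, events_[i], 0);
    cudaMemcpyAsync(gpu_ptr, cpu_ptr, size, cudaMemcpyHostToDevice, stream_);
    // 3. The buffer is fenced before being handed to consumers.
    cudaStreamSynchronize(stream_);
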
memory::Copy(boost::get(place_), gpu_ptr, boost::get(cpu_place), cpu_ptr, size, 0); + } gpu[i].set_lod(cpu[i].lod()); } - PADDLE_ENFORCE(cudaStreamSynchronize(stream)); + PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); } #endif return i; diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h index 87680da01a1f51cfdfe4d100508440eda9d1877f..5f8b2d47c22d0a15d53c8d30d39608fd64d4bddd 100644 --- a/paddle/fluid/operators/reader/buffered_reader.h +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include #include "ThreadPool.h" @@ -63,9 +64,9 @@ class BufferedReader : public framework::DecoratedReader { std::vector gpu_buffer_; size_t prev_pos_{-1UL}; #ifdef PADDLE_WITH_CUDA - cudaStream_t stream; - cudaStream_t compute_stream; - std::vector events; + cudaStream_t stream_; + cudaStream_t compute_stream_; + std::vector events_; #endif }; diff --git a/paddle/fluid/operators/reader/create_custom_reader_op.cc b/paddle/fluid/operators/reader/create_custom_reader_op.cc index 85394b336fc967fc6973131fbedda4c796825185..fdc7b0f6a0e8de232865adb70677af80eb08a174 100644 --- a/paddle/fluid/operators/reader/create_custom_reader_op.cc +++ b/paddle/fluid/operators/reader/create_custom_reader_op.cc @@ -85,10 +85,10 @@ class CreateCustomReaderOpMaker : public DecoratedReaderMakerBase { AddComment(R"DOC( CreateCustomReader Operator - A custom reader can be used for input data preprocessing. - A custom reader holds its own sub-block, which will be executed in CPU - in its 'ReadNext()' function. Users can configurate their own - preprocessing pipelines by inserting operators into custom reader's + A custom reader can be used for input data preprocessing. + A custom reader holds its own sub-block, which will be executed on CPU + in its 'ReadNext()' function. Users can configure their own + preprocessing pipelines by inserting operators into custom reader's sub-block.
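
Further down this file, CustomReaderInferVarType is migrated to the new InferVarTypeContext interface, the same pattern already applied to py_func above. The new interface in its smallest form, using only methods exercised elsewhere in this patch (class and slot names illustrative):

    class PassThroughVarTypeInference : public framework::VarTypeInference {
     public:
      void operator()(framework::InferVarTypeContext* ctx) const override {
        auto& in_name = ctx->Input("X").front();
        auto& out_name = ctx->Output("Out").front();
        // Forward the input's var type and dtype to the output.
        ctx->SetType(out_name, ctx->GetType(in_name));
        ctx->SetDataType(out_name, ctx->GetDataType(in_name));
      }
    };
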
)DOC"); } @@ -123,23 +123,22 @@ class CustomReaderInferShape : public framework::InferShapeBase { class CustomReaderInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc& op_desc, - framework::BlockDesc* block) const override { - framework::VarDesc* out_reader = block->FindVar(op_desc.Output("Out")[0]); - PADDLE_ENFORCE_NOT_NULL(out_reader); - out_reader->SetType(framework::proto::VarType::READER); + void operator()(framework::InferVarTypeContext* ctx) const override { + auto& out_var_name = ctx->Output("Out")[0]; + PADDLE_ENFORCE(ctx->HasVar(out_var_name)); + ctx->SetType(out_var_name, framework::proto::VarType::READER); auto sink_var_names = - boost::get>(op_desc.GetAttr("sink_var_names")); + boost::get>(ctx->GetAttr("sink_var_names")); const auto* sub_block = - boost::get(op_desc.GetAttr("sub_block")); + boost::get(ctx->GetAttr("sub_block")); std::vector res_data_types; for (const std::string& var_name : sink_var_names) { framework::VarDesc* var = sub_block->FindVar(var_name); PADDLE_ENFORCE_NOT_NULL(var); res_data_types.emplace_back(var->GetDataType()); } - out_reader->SetDataTypes(res_data_types); + ctx->SetDataTypes(out_var_name, res_data_types); } }; diff --git a/paddle/fluid/operators/reader/create_py_reader_op.cc b/paddle/fluid/operators/reader/create_py_reader_op.cc index 901a92ab5b5c74b071be8b57a7653d90e2a4fb29..4a6581bbbd00019db33896371adac6d4e420e48c 100644 --- a/paddle/fluid/operators/reader/create_py_reader_op.cc +++ b/paddle/fluid/operators/reader/create_py_reader_op.cc @@ -12,37 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" +#include "paddle/fluid/operators/reader/py_reader.h" #include "paddle/fluid/operators/reader/reader_op_registry.h" namespace paddle { namespace operators { namespace reader { -class PyReader : public framework::FileReader { - public: - explicit PyReader(const std::shared_ptr& queue) - : framework::FileReader() { - PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null"); - queue_ = queue; - } - - void ReadNext(std::vector* out) override { - bool success; - *out = queue_->Pop(&success); - if (!success) out->clear(); - } - - ~PyReader() { queue_->Close(); } - - void Shutdown() override { queue_->Close(); } - - void Start() override { queue_->ReOpen(); } - - private: - std::shared_ptr queue_; -}; - class CreatePyReaderOp : public framework::OperatorBase { public: using framework::OperatorBase::OperatorBase; diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h index 740cd5219c70331d1f71d832adef084c148a2408..0860fb845976c02562a181139e27bd1912a7c179 100644 --- a/paddle/fluid/operators/reader/ctr_reader.h +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -152,7 +153,7 @@ class CTRReader : public framework::FileReader { queue_->ReOpen(); VLOG(3) << "reopen success"; VLOG(3) << "thread_num " << thread_num_; - for (int thread_id = 0; thread_id < thread_num_; thread_id++) { + for (size_t thread_id = 0; thread_id < thread_num_; thread_id++) { read_threads_.emplace_back(new std::thread(std::bind( &ReadThread, file_groups_[thread_id], data_desc_, static_cast(thread_id), &read_thread_status_, queue_))); diff --git a/paddle/fluid/operators/reader/py_reader.cc b/paddle/fluid/operators/reader/py_reader.cc new file mode 100644 index 
0000000000000000000000000000000000000000..155ae859defcf20a5e226a4abfb99dc308dfb23c --- /dev/null +++ b/paddle/fluid/operators/reader/py_reader.cc @@ -0,0 +1,42 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reader/py_reader.h" +#include + +namespace paddle { +namespace operators { +namespace reader { + +PyReader::PyReader(const std::shared_ptr& queue) + : framework::FileReader() { + PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null"); + queue_ = queue; +} + +void PyReader::ReadNext(std::vector* out) { + bool success; + *out = queue_->Pop(&success); + if (!success) out->clear(); +} + +PyReader::~PyReader() { queue_->Close(); } + +void PyReader::Shutdown() { queue_->Close(); } + +void PyReader::Start() { queue_->ReOpen(); } + +} // namespace reader +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reader/py_reader.h b/paddle/fluid/operators/reader/py_reader.h new file mode 100644 index 0000000000000000000000000000000000000000..43079075142e8db22c0e3b7c86de4249d447f961 --- /dev/null +++ b/paddle/fluid/operators/reader/py_reader.h @@ -0,0 +1,45 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
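
A usage sketch for the extracted PyReader; the queue capacity and the producer-side Push/Close calls are assumptions based on the LoDTensorBlockingQueue interface this reader wraps:

    auto queue = std::make_shared<LoDTensorBlockingQueue>(/*capacity=*/64);
    PyReader reader(queue);
    // Producer (e.g. the Python feeding thread):
    //   queue->Push(batch);  ...  queue->Close();
    std::vector<framework::LoDTensor> batch;
    reader.ReadNext(&batch);
    if (batch.empty()) {
      // Queue closed and drained: the pass is over; Start() reopens it.
    }
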
+ +#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" + +namespace paddle { +namespace operators { +namespace reader { + +class PyReader : public framework::FileReader { + public: + explicit PyReader(const std::shared_ptr& queue); + + void ReadNext(std::vector* out) override; + + ~PyReader(); + + void Shutdown() override; + + void Start() override; + + private: + std::shared_ptr queue_; +}; + +} // namespace reader +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reader/read_op.cc b/paddle/fluid/operators/reader/read_op.cc index 846b2ed77e46d82fbeda8faaeed99cddf23c8824..33a69ad5fec2b850cae070ca3f113f12c4e835f9 100644 --- a/paddle/fluid/operators/reader/read_op.cc +++ b/paddle/fluid/operators/reader/read_op.cc @@ -51,19 +51,16 @@ class ReadInferShape : public framework::InferShapeBase { class ReadInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc& op_desc, - framework::BlockDesc* block) const override { - bool infer_out = boost::get(op_desc.GetAttr("infer_out")); + void operator()(framework::InferVarTypeContext* ctx) const override { + bool infer_out = boost::get(ctx->GetAttr("infer_out")); if (infer_out) { - std::string reader_name = op_desc.Input("Reader")[0]; - std::vector out_names = op_desc.Output("Out"); - framework::VarDesc* reader = block->FindVarRecursive(reader_name); - auto dtypes = reader->GetDataTypes(); + std::string reader_name = ctx->Input("Reader")[0]; + std::vector out_names = ctx->Output("Out"); + auto dtypes = ctx->GetDataTypes(reader_name); PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size()); for (size_t i = 0; i < dtypes.size(); ++i) { - framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]); - out.SetType(framework::proto::VarType::LOD_TENSOR); - out.SetDataType(dtypes[i]); + ctx->SetType(out_names[i], framework::proto::VarType::LOD_TENSOR); + ctx->SetDataType(out_names[i], dtypes[i]); } } } diff --git a/paddle/fluid/operators/reader/reader_op_registry.cc b/paddle/fluid/operators/reader/reader_op_registry.cc index 3921eedf94abbe68bed035940913f830a6c16e48..64a1f6b68702f33ec72d901cf6621b674b331030 100644 --- a/paddle/fluid/operators/reader/reader_op_registry.cc +++ b/paddle/fluid/operators/reader/reader_op_registry.cc @@ -98,11 +98,10 @@ void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const { } } -void FileReaderInferVarType::operator()(const framework::OpDesc& op_desc, - framework::BlockDesc* block) const { - std::string reader_name = op_desc.Output("Out")[0]; - framework::VarDesc* reader = block->FindVarRecursive(reader_name); - reader->SetType(framework::proto::VarType::READER); +void FileReaderInferVarType::operator()( + framework::InferVarTypeContext* ctx) const { + std::string reader_name = ctx->Output("Out")[0]; + ctx->SetType(reader_name, framework::proto::VarType::READER); } void DecoratedReaderInferShape::operator()( @@ -125,13 +124,11 @@ void DecoratedReaderInferShape::operator()( } void DecoratedReaderInferVarType::operator()( - const framework::OpDesc& op_desc, framework::BlockDesc* block) const { - std::string in_reader_name = op_desc.Input("UnderlyingReader")[0]; - framework::VarDesc* in_reader = block->FindVarRecursive(in_reader_name); - std::string out_reader_name = op_desc.Output("Out")[0]; - framework::VarDesc* out_reader = block->FindVarRecursive(out_reader_name); - 
out_reader->SetType(framework::proto::VarType::READER); - out_reader->SetDataTypes(in_reader->GetDataTypes()); + framework::InferVarTypeContext* ctx) const { + const std::string& in_reader_name = ctx->Input("UnderlyingReader")[0]; + const std::string& out_reader_name = ctx->Output("Out")[0]; + ctx->SetType(out_reader_name, framework::proto::VarType::READER); + ctx->SetDataTypes(out_reader_name, ctx->GetDataTypes(in_reader_name)); } void DecoratedReaderMakerBase::Make() { diff --git a/paddle/fluid/operators/reader/reader_op_registry.h b/paddle/fluid/operators/reader/reader_op_registry.h index 25c3e7d77b788d38daf6dee1fc79e5c1c97e8842..795a5806050efe6469732004125e4a80b08e5304 100644 --- a/paddle/fluid/operators/reader/reader_op_registry.h +++ b/paddle/fluid/operators/reader/reader_op_registry.h @@ -14,7 +14,9 @@ #pragma once +#include #include +#include #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/reader.h" @@ -59,8 +61,7 @@ class FileReaderInferShape : public framework::InferShapeBase { class FileReaderInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc& op_desc, - framework::BlockDesc* block) const override; + void operator()(framework::InferVarTypeContext* ctx) const override; }; // general infershape for decorated reader @@ -72,8 +73,7 @@ class DecoratedReaderInferShape : public framework::InferShapeBase { // general var type inference for decorated reader class DecoratedReaderInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc& op_desc, - framework::BlockDesc* block) const override; + void operator()(framework::InferVarTypeContext* ctx) const override; }; class DecoratedReaderMakerBase : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc index 88c968a0eaae8a2ac6f14ede9348c837bcd92d76..2898a62ddbac524ceb212cac5f34aeda3b1e01cb 100644 --- a/paddle/fluid/operators/recurrent_op.cc +++ b/paddle/fluid/operators/recurrent_op.cc @@ -282,7 +282,9 @@ class RecurrentOp : public RecurrentBase { // Every inputs are linked now, execute! executor.Run(*program, &cur_scope, block->ID(), - false /*create_local_scope*/); + false /*create_local_scope*/, true /*create_vars*/, + std::vector() /*skip_ref_cnt_vars*/, + true /*force_disable_gc*/); // get device context from pool platform::DeviceContextPool &pool = @@ -398,7 +400,9 @@ class RecurrentGradOp : public RecurrentBase { VLOG(5) << "Recurrent memory linking finished "; // Run step block with cur_scope executor.Run(*program, &cur_scope, block->ID(), - false /*create_local_scope*/); + false /*create_local_scope*/, true /*create_vars*/, + std::vector() /*skip_ref_cnt_vars*/, + true /*force_disable_gc*/); VLOG(5) << "executor.Run finished "; diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 37f69426b62fedf8cbeca68105fb86fb4ea72eab..5165af6a253e7f57c1e27cc017f2a0cbc1f70f38 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -219,14 +219,6 @@ class ReshapeKernel { std::vector(shape_data, shape_data + shape_tensor->numel()); out_dims = ReshapeOp::ValidateShape(shape, in->dims()); } - if (!in->lod().empty()) { - PADDLE_ENFORCE_EQ( - out_dims[0], in->dims()[0], - "Reshape operator cannot reshape an input sequence batch " - "into an output sequence batch that has a different " - "number of time steps. 
Please consider using " - "sequence_reshape op."); - } out->mutable_data(ctx.GetPlace(), in->type()); framework::TensorCopy( @@ -330,14 +322,10 @@ class Reshape2GradOp : public framework::OperatorWithKernel { } }; -class ReshapeOpInplaceInToOut : public framework::InplaceInToOut { +class ReshapeOpInplaceInToOut : public framework::InplaceOpInference { public: - using InplaceInToOut::InplaceInToOut; - - protected: - std::unordered_map Apply( - const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { + std::unordered_map operator()( + const framework::OpDesc &op_desc) const override { std::unordered_map inplace_in_to_out = { {"X", "Out"}, }; @@ -345,13 +333,10 @@ class ReshapeOpInplaceInToOut : public framework::InplaceInToOut { } }; -class ReshapeGradInplaceInToOut : public framework::InplaceInToOut { - using InplaceInToOut::InplaceInToOut; - - protected: - std::unordered_map Apply( - const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { +class ReshapeGradInplaceInToOut : public framework::InplaceOpInference { + public: + std::unordered_map operator()( + const framework::OpDesc &op_desc) const override { std::unordered_map inplace_in_to_out = { {framework::GradVarName("Out"), framework::GradVarName("X")}, }; diff --git a/paddle/fluid/operators/save_combine_op.cc b/paddle/fluid/operators/save_combine_op.cc index d0edcc170f0afbccdcdf83eed9a167b7602e34ab..953e2655d13328b986a67398dca54f8a5e3aedcf 100644 --- a/paddle/fluid/operators/save_combine_op.cc +++ b/paddle/fluid/operators/save_combine_op.cc @@ -12,86 +12,33 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include -#include -#include -#include -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/port.h" +#include + +#include "paddle/fluid/operators/save_combine_op.h" namespace paddle { namespace operators { -class SaveCombineOp : public framework::OperatorBase { +using Tensor = framework::Tensor; + +class SaveCombineOp : public framework::OperatorWithKernel { public: - SaveCombineOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &place) const override { - auto filename = Attr("file_path"); - auto overwrite = Attr("overwrite"); - auto save_as_fp16 = Attr("save_as_fp16"); - - bool is_present = FileExists(filename); - if (is_present && !overwrite) { - PADDLE_THROW("%s exists!, cannot save_combine to it when overwrite=false", - filename, overwrite); - } - - MkDirRecursively(DirName(filename).c_str()); - std::ofstream fout(filename, std::ios::binary); - PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", - filename); - - auto inp_var_names = Inputs("X"); - PADDLE_ENFORCE_GT(static_cast(inp_var_names.size()), 0, - "The number of input variables should be greater than 0"); - - // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - - for (size_t i = 0; 
i < inp_var_names.size(); i++) { - auto *var = scope.FindVar(inp_var_names[i]); - - PADDLE_ENFORCE(var != nullptr, - "Cannot find variable %s for save_combine_op", - inp_var_names[i]); - PADDLE_ENFORCE(var->IsType(), - "SaveCombineOp only supports LoDTensor, %s has wrong type", - inp_var_names[i]); - - auto &tensor = var->Get(); - // Serialize tensors one by one - - // Check types to see if a fp16 transformation is required - auto in_dtype = tensor.type(); - auto out_dtype = - save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype; - - if (in_dtype != out_dtype) { - auto in_kernel_type = framework::OpKernelType(in_dtype, place); - auto out_kernel_type = framework::OpKernelType(out_dtype, place); - framework::LoDTensor out; - // copy LoD info to the new tensor - out.set_lod(tensor.lod()); - framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out); - framework::SerializeToStream(fout, out, dev_ctx); - } else { - framework::SerializeToStream(fout, tensor, dev_ctx); - } - } - fout.close(); + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override {} + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(framework::proto::VarType::FP32, + ctx.GetPlace()); + } + // TODO(lujun): The override here is just to bypass transform + // in operator impl, which is not elegant enough. + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + return expected_kernel_type; } }; @@ -105,7 +52,7 @@ class SaveCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( SaveCombine operator -This operator will serialize and write a list of input LoDTensor variables +This operator will serialize and write a list of input LoDTensor variables to a file on disk. )DOC"); AddAttr("overwrite", @@ -123,7 +70,7 @@ to a file on disk. "(string)" "The \"file_path\" where the LoDTensor variables will be saved.") .AddCustomChecker( - [](const std::string &path) { return !path.empty(); }); + [](const std::string& path) { return !path.empty(); }); } }; @@ -134,3 +81,9 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(save_combine, ops::SaveCombineOp, ops::SaveCombineOpProtoMaker); + +REGISTER_OP_CPU_KERNEL( + save_combine, + ops::SaveCombineOpKernel, + ops::SaveCombineOpKernel, + ops::SaveCombineOpKernel); diff --git a/paddle/fluid/operators/save_combine_op.cu b/paddle/fluid/operators/save_combine_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..78607823a0368d216310bbbb390fd7face002839 --- /dev/null +++ b/paddle/fluid/operators/save_combine_op.cu @@ -0,0 +1,23 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/save_combine_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + save_combine, + ops::SaveCombineOpKernel, + ops::SaveCombineOpKernel, + ops::SaveCombineOpKernel); diff --git a/paddle/fluid/operators/save_combine_op.h b/paddle/fluid/operators/save_combine_op.h new file mode 100644 index 0000000000000000000000000000000000000000..4ee82e17dd5e8173ce7dfb5c248890912d2cc7ef --- /dev/null +++ b/paddle/fluid/operators/save_combine_op.h @@ -0,0 +1,95 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/data_type_transform.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/port.h" + +namespace paddle { +namespace operators { +template +class SaveCombineOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto place = ctx.GetPlace(); + auto filename = ctx.Attr("file_path"); + auto overwrite = ctx.Attr("overwrite"); + auto save_as_fp16 = ctx.Attr("save_as_fp16"); + + bool is_present = FileExists(filename); + if (is_present && !overwrite) { + PADDLE_THROW("%s exists! Cannot save_combine to it when overwrite=false", + filename, overwrite); + } + + MkDirRecursively(DirName(filename).c_str()); + std::ofstream fout(filename, std::ios::binary); + PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", + filename); + + auto &inp_var_names = ctx.Inputs("X"); + auto &inp_vars = ctx.MultiInputVar("X"); + PADDLE_ENFORCE_GT(static_cast(inp_var_names.size()), 0, + "The number of input variables should be greater than 0"); + + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + for (size_t i = 0; i < inp_var_names.size(); i++) { + PADDLE_ENFORCE(inp_vars[i] != nullptr, + "Cannot find variable %s for save_combine_op", + inp_var_names[i]); + PADDLE_ENFORCE(inp_vars[i]->IsType(), + "SaveCombineOp only supports LoDTensor, %s has wrong type", + inp_var_names[i]); + + auto &tensor = inp_vars[i]->Get(); + // Serialize tensors one by one + + // Check types to see if a fp16 transformation is required + auto in_dtype = tensor.type(); + auto out_dtype = + save_as_fp16 ? 
framework::proto::VarType::FP16 : in_dtype; + + if (in_dtype != out_dtype) { + auto in_kernel_type = framework::OpKernelType(in_dtype, place); + auto out_kernel_type = framework::OpKernelType(out_dtype, place); + framework::LoDTensor out; + // copy LoD info to the new tensor + out.set_lod(tensor.lod()); + framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out); + framework::SerializeToStream(fout, out, dev_ctx); + } else { + framework::SerializeToStream(fout, tensor, dev_ctx); + } + } + fout.close(); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/save_load_combine_op_test.cc b/paddle/fluid/operators/save_load_combine_op_test.cc index 4743e0d9499b111d8baa921dbb245431713fd7a8..5594de16b6789e99d5c4cc6828889eb0e311624a 100644 --- a/paddle/fluid/operators/save_load_combine_op_test.cc +++ b/paddle/fluid/operators/save_load_combine_op_test.cc @@ -19,8 +19,8 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/float16.h" -USE_NO_KERNEL_OP(save_combine); -USE_NO_KERNEL_OP(load_combine); +USE_CPU_ONLY_OP(save_combine); +USE_CPU_ONLY_OP(load_combine); template T* CreateForSaveCombineOp(int x, int y, const std::vector& lod_info, diff --git a/paddle/fluid/operators/save_load_op_test.cc b/paddle/fluid/operators/save_load_op_test.cc index ccaea0eef2906953d922e097348b6c0a86dad6f1..d277198a2f92c426586e774873c6770b93660e85 100644 --- a/paddle/fluid/operators/save_load_op_test.cc +++ b/paddle/fluid/operators/save_load_op_test.cc @@ -16,8 +16,8 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/float16.h" -USE_NO_KERNEL_OP(save); -USE_NO_KERNEL_OP(load); +USE_CPU_ONLY_OP(save); +USE_CPU_ONLY_OP(load); TEST(SaveLoadOp, CPU) { paddle::framework::Scope scope; diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index fcc598f4f16138b4cc13c7b9bb59e79d80cf3596..338e2fbb5d868f146c9ff420b2d5d4cf6088316e 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -15,118 +15,24 @@ limitations under the License. */ #include #include #include +#include +#include -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/port.h" +#include "paddle/fluid/operators/save_op.h" namespace paddle { namespace operators { - -// define LOOKUP_TABLE_PATH for checkpoint notify to save lookup table variables -// to directory specified. 
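
SaveOp gets the same treatment SaveCombineOp received above: a formerly kernel-less OperatorBase becomes an OperatorWithKernel with an empty InferShape and an explicit kernel-type choice, so per-place CPU/CUDA kernels can be registered and the tests can switch to USE_CPU_ONLY_OP. The conversion pattern, sketched (class name illustrative):

    class MySaveLikeOp : public framework::OperatorWithKernel {
     public:
      using framework::OperatorWithKernel::OperatorWithKernel;
      // Nothing to infer: the op only writes bytes to disk.
      void InferShape(framework::InferShapeContext* ctx) const override {}

     protected:
      framework::OpKernelType GetExpectedKernelType(
          const framework::ExecutionContext& ctx) const override {
        // Dispatch on the input's dtype and the current place.
        return framework::OpKernelType(
            ctx.Input<framework::LoDTensor>("X")->type(), ctx.GetPlace());
      }
    };
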
-constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath"; - -class SaveOp : public framework::OperatorBase { +class SaveOp : public framework::OperatorWithKernel { public: - SaveOp(const std::string &type, const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &place) const override { - auto iname = Input("X"); - auto *var = scope.FindVar(iname); - PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s for save_op", - iname); - - if (var->IsType()) { - SaveLodTensor(place, var); - } else if (var->IsType()) { - SaveSelectedRows(scope, place, var); - } else { - PADDLE_ENFORCE( - false, - "SaveOp only support LoDTensor and SelectedRows, %s has wrong type", - iname); - } - } + using framework::OperatorWithKernel::OperatorWithKernel; - void SaveLodTensor(const platform::Place &place, - framework::Variable *var) const { - auto filename = Attr("file_path"); - auto overwrite = Attr("overwrite"); - - if (FileExists(filename) && !overwrite) { - PADDLE_THROW("%s is existed, cannot save to it when overwrite=false", - filename, overwrite); - } - - MkDirRecursively(DirName(filename).c_str()); - - auto &tensor = var->Get(); - - // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - - // FIXME(yuyang18): We save variable to local file now, but we should change - // it to save an output stream. - std::ofstream fout(filename, std::ios::binary); - PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", - filename); - - auto save_as_fp16 = Attr("save_as_fp16"); - auto in_dtype = tensor.type(); - auto out_dtype = save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype; - - if (in_dtype != out_dtype) { - auto in_kernel_type = framework::OpKernelType(in_dtype, place); - auto out_kernel_type = framework::OpKernelType(out_dtype, place); - framework::LoDTensor out; - framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out); - // copy LoD info to the new tensor - out.set_lod(tensor.lod()); - framework::SerializeToStream(fout, out, dev_ctx); - } else { - framework::SerializeToStream(fout, tensor, dev_ctx); - } - fout.close(); - } + void InferShape(framework::InferShapeContext *ctx) const override {} - void SaveSelectedRows(const framework::Scope &scope, - const platform::Place &place, - framework::Variable *var) const { - auto *lt_var = scope.FindVar(LOOKUP_TABLE_PATH)->GetMutable(); - PADDLE_ENFORCE( - lt_var != nullptr, - "Can not find variable kLookupTablePath for SaveSelectedRows"); - std::string filename = lt_var->data(); - VLOG(4) << "SaveSelectedRows get File name: " << filename; - - MkDirRecursively(DirName(filename).c_str()); - - auto &selectedRows = var->Get(); - - // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - - // FIXME(yuyang18): We save variable to local file now, but we should change - // it to save an output stream. 
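
The FIXME above should be straightforward to act on later, since SerializeToStream already targets std::ostream rather than a file; only the sink needs swapping. A sketch with an in-memory sink (requires <sstream>):

    std::ostringstream buffer;
    framework::SerializeToStream(buffer, tensor, dev_ctx);
    // buffer.str() now holds the serialized bytes and can be shipped over
    // RPC or written wherever the checkpoint should live.
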
- std::ofstream fout(filename, std::ios::binary); - PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", - filename); - framework::SerializeToStream(fout, selectedRows, dev_ctx); - fout.close(); + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); } }; @@ -154,17 +60,20 @@ This operator will serialize and write LoDTensor / SelectedRows variable to file "The \"file_path\" where the variable will be saved.") .AddCustomChecker( [](const std::string &path) { return !path.empty(); }); + AddOutput(LOOKUP_TABLE_PATH, + "(string)" + "for pserver: The \"kLookupTablePath\" where checkpoint notify " + "to save lookup table variables" + " to directory specified.") + .AsDispensable(); } }; class SaveOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { - auto out_var_name = op_desc.Output(LOOKUP_TABLE_PATH).front(); - auto &out_var = block->FindRecursiveOrCreateVar(out_var_name); + void operator()(framework::InferVarTypeContext *ctx) const override { auto var_type = framework::proto::VarType::RAW; - out_var.SetType(var_type); + ctx->SetType(LOOKUP_TABLE_PATH, var_type); } }; @@ -172,11 +81,18 @@ class SaveOpShapeInference : public framework::InferShapeBase { public: void operator()(framework::InferShapeContext *ctx) const override {} }; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(save, ops::SaveOp, paddle::framework::EmptyGradOpMaker, - ops::SaveOpProtoMaker, ops::SaveOpVarTypeInference, - ops::SaveOpShapeInference); +REGISTER_OPERATOR(save, ops::SaveOp, ops::SaveOpProtoMaker, + ops::SaveOpVarTypeInference, ops::SaveOpShapeInference); + +REGISTER_OP_CPU_KERNEL( + save, ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel); diff --git a/paddle/fluid/operators/save_op.cu b/paddle/fluid/operators/save_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..0a778a694e52f146b6cceddb969b8af08f40ef9e --- /dev/null +++ b/paddle/fluid/operators/save_op.cu @@ -0,0 +1,27 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/save_op.h" +#include "paddle/fluid/platform/float16.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + save, ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel); diff --git a/paddle/fluid/operators/save_op.h b/paddle/fluid/operators/save_op.h new file mode 100644 index 0000000000000000000000000000000000000000..642235aad58bef2ec7f741ee5fb5a65a2081f4ce --- /dev/null +++ b/paddle/fluid/operators/save_op.h @@ -0,0 +1,133 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/data_type_transform.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/variable.h" + +namespace paddle { +namespace operators { +// define LOOKUP_TABLE_PATH for checkpoint notify to save lookup table variables +// to directory specified. +constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath"; +template +class SaveOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto place = ctx.GetPlace(); + + auto *input_var = ctx.InputVar("X"); + auto iname = ctx.Inputs("X").data(); + PADDLE_ENFORCE(input_var != nullptr, "Cannot find variable %s for save_op", + iname); + + if (input_var->IsType()) { + SaveLodTensor(ctx, place, input_var); + } else if (input_var->IsType()) { + SaveSelectedRows(ctx, place, input_var); + } else { + PADDLE_ENFORCE( + false, + "SaveOp only supports LoDTensor and SelectedRows, %s has wrong type", + iname); + } + } + + void SaveLodTensor(const framework::ExecutionContext &ctx, + const platform::Place &place, + const framework::Variable *var) const { + auto filename = ctx.Attr("file_path"); + auto overwrite = ctx.Attr("overwrite"); + + if (FileExists(filename) && !overwrite) { + PADDLE_THROW("%s already exists; cannot save to it when overwrite=false", + filename, overwrite); + } + + MkDirRecursively(DirName(filename).c_str()); + + auto &tensor = var->Get(); + + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + // FIXME(yuyang18): We save variable to local file now, but we should change + // it to save an output stream. + std::ofstream fout(filename, std::ios::binary); + PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", + filename); + + auto save_as_fp16 = ctx.Attr("save_as_fp16"); + auto in_dtype = tensor.type(); + auto out_dtype = save_as_fp16 ? 
framework::proto::VarType::FP16 : in_dtype; + + if (in_dtype != out_dtype) { + auto in_kernel_type = framework::OpKernelType(in_dtype, place); + auto out_kernel_type = framework::OpKernelType(out_dtype, place); + framework::LoDTensor out; + framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out); + // copy LoD info to the new tensor + out.set_lod(tensor.lod()); + framework::SerializeToStream(fout, out, dev_ctx); + } else { + framework::SerializeToStream(fout, tensor, dev_ctx); + } + fout.close(); + } + + void SaveSelectedRows(const framework::ExecutionContext &ctx, + const platform::Place &place, + const framework::Variable *var) const { + framework::Variable *out_put_var = ctx.OutputVar(LOOKUP_TABLE_PATH); + PADDLE_ENFORCE( + out_put_var != nullptr, + "Can not find variable kLookupTablePath for SaveSelectedRows"); + auto *lt_var = out_put_var->GetMutable(); + + std::string filename = lt_var->data(); + VLOG(4) << "SaveSelectedRows get File name: " << filename; + + MkDirRecursively(DirName(filename).c_str()); + + auto &selectedRows = var->Get(); + + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + // FIXME(yuyang18): We save variable to local file now, but we should change + // it to save an output stream. + std::ofstream fout(filename, std::ios::binary); + PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", + filename); + framework::SerializeToStream(fout, selectedRows, dev_ctx); + fout.close(); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index 4ea77ed30db212b694f2050952655dd1a42215bd..4e4a015e18305cd7aad71722056b15216f44782e 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/scale_op.h" +#include #include #include "paddle/fluid/operators/detail/safe_ref.h" @@ -69,17 +70,13 @@ $$Out = scale*(X + bias)$$ class ScaleOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { - auto &in_var_name = op_desc.Input("X").front(); - auto &in_var = detail::Ref(block->FindVarRecursive(in_var_name)); - - auto out_var_name = op_desc.Output("Out").front(); - auto *out_var = block->FindVarRecursive(out_var_name); + void operator()(framework::InferVarTypeContext *ctx) const override { + auto &in_var_name = ctx->Input("X").front(); + auto out_var_name = ctx->Output("Out").front(); if (in_var_name != out_var_name) { - out_var->SetType(in_var.GetType()); - out_var->SetDataType(in_var.GetDataType()); + ctx->SetType(out_var_name, ctx->GetType(in_var_name)); + ctx->SetDataType(out_var_name, ctx->GetDataType(in_var_name)); } } }; diff --git a/paddle/fluid/operators/selu_op.h b/paddle/fluid/operators/selu_op.h index bdb506885c932708803fe8d84ee705aee0fe02b4..b2fc834c42f65ff3521b6267ed2f32fabbab4e4d 100644 --- a/paddle/fluid/operators/selu_op.h +++ b/paddle/fluid/operators/selu_op.h @@ -15,13 +15,12 @@ limitations under the License. 
*/ #pragma once #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math.h" #include "paddle/fluid/platform/for_range.h" + namespace paddle { namespace operators { -static HOSTDEVICE float real_exp(float x) { return expf(x); } -static HOSTDEVICE float real_exp(double x) { return exp(x); } - template struct SeluFunctor { SeluFunctor(const T* x_data_ptr, float alpha, float scale, T* y_data_ptr) diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc index d3dcd1f96a986d2450c8af780a12183f7dfc66d5..cc4eedbf4de2272caac75eb1e5a1d51feaf8cb38 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc @@ -22,9 +22,6 @@ class SequenceEnumerateOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - if (ctx->IsRuntime()) { - return; - } PADDLE_ENFORCE( ctx->HasInput("X"), "Input(X) of SequenceEnumerate operator should not be null."); @@ -33,13 +30,6 @@ class SequenceEnumerateOp : public framework::OperatorWithKernel { "Output(Out) of SequenceEnumerate operator should not be null."); const auto x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ( - x_dims.size(), 2, - "Input(X) of SequenceEnumerate operator's rank should be 2."); - PADDLE_ENFORCE_EQ(x_dims[1], 1, - "Input(X) of SequenceEnumerate operator's 2nd " - "dimension should be 1."); - const auto win_size = ctx->Attrs().Get("win_size"); ctx->SetOutputDim("Out", {x_dims[0], win_size}); ctx->ShareLoD("X", "Out"); @@ -62,6 +52,9 @@ class SequenceEnumerateOpMaker : public framework::OpProtoAndCheckerMaker { }); AddAttr("pad_value", "(int) The enumerate sequence padding value.") .SetDefault(0); + AddAttr(framework::kAllKernelsMustComputeRuntimeShape, + "Skip calling InferShape() at runtime.") + .SetDefault(true); AddComment(R"DOC( Sequence Enumerate Operator.
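
The new kAllKernelsMustComputeRuntimeShape attribute above makes the framework skip its run-time InferShape call, so the kernel (rewritten in the next hunk) must size its outputs from the actual LoD itself. In outline:

    // Inside the kernel's Compute(): dims come from the runtime batch,
    // not from compile-time shape inference.
    auto* in = context.Input<framework::LoDTensor>("X");
    auto* out = context.Output<framework::LoDTensor>("Out");
    int win_size = context.Attr<int>("win_size");
    out->Resize({in->dims()[0], win_size});  // batch-dependent shape
    out->set_lod(in->lod());
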
diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h index 18da69993b2ad5879dd4678ec0d4b06d7e30cb0a..6a1eb6e625b6990506ba554de4e2398daeb64451 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h @@ -27,30 +27,47 @@ class SequenceEnumerateKernel : public framework::OpKernel { auto* in = context.Input("X"); auto* out = context.Output("Out"); int win_size = context.Attr("win_size"); - int pad_value = context.Attr("pad_value"); + auto pad_value = static_cast(context.Attr("pad_value")); auto in_dims = in->dims(); - auto in_lod = in->lod(); - + auto lod0 = in->lod()[0]; PADDLE_ENFORCE_EQ( - static_cast(in_dims[0]), in_lod[0].back(), + static_cast(in_dims[0]), lod0.back(), "The actual input data's size mismatched with LoD information."); + PADDLE_ENFORCE_EQ( + in_dims.size(), 2UL, + "Input(X) of SequenceEnumerate operator's rank should be 2."); + PADDLE_ENFORCE_EQ(in_dims[1], 1, + "Input(X) of SequenceEnumerate operator's 2nd " + "dimension should be 1."); // Generate enumerate sequence set - auto lod0 = in_lod[0]; auto in_data = in->data(); out->Resize({in_dims[0], win_size}); + out->set_lod(in->lod()); auto out_data = out->mutable_data(context.GetPlace()); for (size_t i = 0; i < lod0.size() - 1; ++i) { - for (size_t idx = lod0[i]; idx < lod0[i + 1]; ++idx) { - for (int word_idx = 0; word_idx < win_size; ++word_idx) { - size_t word_pos = idx + word_idx; - out_data[win_size * idx + word_idx] = - word_pos < lod0[i + 1] ? in_data[word_pos] : pad_value; + int start = lod0[i]; + int end = lod0[i + 1]; + int copy_size = win_size < end - start + 1 ? win_size : end - start + 1; + int mid = end + 1 - copy_size; + int pad_num = win_size - copy_size; + copy_size *= sizeof(T); + for (int idx = start; idx < mid; ++idx) { + std::memcpy(out_data, in_data + idx, copy_size); + out_data += win_size; + } + for (int idx = mid; idx < end; ++idx) { + copy_size -= sizeof(T); + pad_num++; + std::memcpy(out_data, in_data + idx, copy_size); + T* pdata = out_data + copy_size / sizeof(T); + for (int i = 0; i < pad_num; ++i) { + pdata[i] = pad_value; } + out_data += win_size; } } - out->set_lod(in->lod()); } }; diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu index cc5e9821903fb7a726f52177df1d17757f697411..a9dc0a4fda253db9bb0d33c4a25fbba36492f35b 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu @@ -14,6 +14,7 @@ limitations under the License. 
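
A worked example of what the rewritten enumerate loop produces; this reference (unoptimized) version states the semantics the memcpy-based code above implements for one sequence:

    #include <cstdint>
    #include <vector>

    // in = [1, 2, 3], win_size = 2, pad = 0:
    //   row 0: window [0,2) -> {1, 2}  (full window, one memcpy above)
    //   row 1: window [1,3) -> {2, 3}  (full window, one memcpy above)
    //   row 2: window [2,4) -> {3, 0}  (tail: one element copied, one padded)
    std::vector<std::vector<int64_t>> Enumerate(const std::vector<int64_t>& in,
                                                int win_size, int64_t pad) {
      std::vector<std::vector<int64_t>> out(in.size());
      for (size_t i = 0; i < in.size(); ++i)
        for (int w = 0; w < win_size; ++w)
          out[i].push_back(i + w < in.size() ? in[i + w] : pad);
      return out;
    }
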
*/ #include #include // NOLINT +#include "paddle/fluid/operators/math.h" #include "paddle/fluid/operators/sequence_ops/sequence_softmax_op.h" namespace paddle { @@ -21,9 +22,6 @@ namespace operators { using LoDTensor = framework::LoDTensor; -__device__ __forceinline__ float real_exp(float x) { return expf(x); } -__device__ __forceinline__ double real_exp(double x) { return exp(x); } - template using BlockReduce = cub::BlockReduce; diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu index 2a4570ef5cec0bee07efd69a2efd1a079ff33df5..aea69de6434a38aa834ff14f6d3d15ad5bbfc3e6 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "cub/cub.cuh" +#include "paddle/fluid/operators/math.h" #include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" #include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/hostdevice.h" @@ -21,11 +22,6 @@ namespace operators { using Tensor = framework::Tensor; -static HOSTDEVICE float real_exp(float x) { return expf(x); } -static HOSTDEVICE float real_exp(double x) { return exp(x); } -static HOSTDEVICE float real_log(float x) { return logf(x); } -static HOSTDEVICE float real_log(double x) { return log(x); } - static constexpr int kNumCUDAThreads = 512; static constexpr int kNumMaxinumNumBlocks = 4096; diff --git a/paddle/fluid/operators/slice_op.cu b/paddle/fluid/operators/slice_op.cu index 5efecb78d1a4eaffc3a9c62e1e82a9bcb5922748..24a564f9ef9d6e7bdb80047d69b35a980a141bab 100644 --- a/paddle/fluid/operators/slice_op.cu +++ b/paddle/fluid/operators/slice_op.cu @@ -12,18 +12,138 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
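Three files in this diff (selu_op.h, sequence_softmax_op.cu, and sigmoid_cross_entropy_with_logits_op.cu above) delete private copies of real_exp/real_log in favor of a single #include "paddle/fluid/operators/math.h". The duplication had already drifted: the removed overloads taking double were declared to return float, silently truncating. Presumably the shared header centralizes type-correct helpers along these lines (a sketch, not the verbatim header):

```cpp
// Sketch of consolidated exp/log helpers usable from host and device code.
#include <math.h>

#ifdef __CUDACC__
#define HOSTDEVICE __host__ __device__
#else
#define HOSTDEVICE
#endif

HOSTDEVICE inline float real_exp(float x) { return expf(x); }
HOSTDEVICE inline double real_exp(double x) { return exp(x); }
HOSTDEVICE inline float real_log(float x) { return logf(x); }
HOSTDEVICE inline double real_log(double x) { return log(x); }
```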
*/ +#include +#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/slice_op.h" +#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +using platform::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void Padding(const paddle::platform::float16* d_out, + const int* out_dims, const int* in_dims, + const int* offsets, int64_t n, + paddle::platform::float16* d_in) { + int64_t out_idx = threadIdx.x + blockDim.x * blockIdx.x; + if (out_idx < n) { + int64_t out_idx_tmp = out_idx; + int coords[D] = {0}; + for (int i = D - 1; i >= 0; --i) { + coords[i] = out_idx_tmp % out_dims[i]; + out_idx_tmp /= out_dims[i]; + coords[i] += offsets[i]; + } + + int64_t in_idx = 0; + for (int i = 0; i < D; ++i) { + in_idx = in_idx * in_dims[i] + coords[i]; + } + + d_in[in_idx] = d_out[out_idx]; + } +} + +template <> +class SliceGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto* d_in = ctx.Output(framework::GradVarName("Input")); + d_in->mutable_data(ctx.GetPlace()); + + auto out_dims = d_out->dims(); + auto in_dims = d_in->dims(); + int rank = out_dims.size(); + std::vector offsets(rank, 0); + auto axes = ctx.Attr>("axes"); + auto starts = ctx.Attr>("starts"); + + for (size_t i = 0; i < starts.size(); ++i) { + if (starts[i] < 0) { + starts[i] += in_dims[axes[i]]; + } + offsets[axes[i]] = std::max(starts[i], 0); + } + + math::SetConstant + set_zero; + auto& dev_ctx = + ctx.template device_context(); + set_zero(dev_ctx, d_in, static_cast(0)); + + int64_t numel = d_out->numel(); + dim3 blocks((numel - 1) / PADDLE_CUDA_NUM_THREADS + 1); + dim3 threads(PADDLE_CUDA_NUM_THREADS); + auto stream = ctx.cuda_device_context().stream(); + + auto out_shape = framework::vectorize2int(out_dims); + thrust::device_vector out_dims_vec(out_shape.begin(), out_shape.end()); + auto in_shape = framework::vectorize2int(in_dims); + thrust::device_vector in_dims_vec(in_shape.begin(), in_shape.end()); + thrust::device_vector offsets_vec(offsets.begin(), offsets.end()); + const int* out_dims_ptr = thrust::raw_pointer_cast(out_dims_vec.data()); + const int* in_dims_ptr = thrust::raw_pointer_cast(in_dims_vec.data()); + const int* offsets_ptr = thrust::raw_pointer_cast(offsets_vec.data()); + + switch (rank) { + case 1: + Padding<1><<>>( + d_out->data(), out_dims_ptr, in_dims_ptr, + offsets_ptr, numel, d_in->data()); + break; + case 2: + Padding<2><<>>( + d_out->data(), out_dims_ptr, in_dims_ptr, + offsets_ptr, numel, d_in->data()); + break; + case 3: + Padding<3><<>>( + d_out->data(), out_dims_ptr, in_dims_ptr, + offsets_ptr, numel, d_in->data()); + break; + case 4: + Padding<4><<>>( + d_out->data(), out_dims_ptr, in_dims_ptr, + offsets_ptr, numel, d_in->data()); + break; + case 5: + Padding<5><<>>( + d_out->data(), out_dims_ptr, in_dims_ptr, + offsets_ptr, numel, d_in->data()); + break; + case 6: + Padding<6><<>>( + d_out->data(), out_dims_ptr, in_dims_ptr, + offsets_ptr, numel, d_in->data()); + break; + } + } +}; + +} // namespace operators +} // namespace paddle namespace ops = paddle::operators; +namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( slice, ops::SliceKernel, ops::SliceKernel, ops::SliceKernel, - ops::SliceKernel); + ops::SliceKernel, + ops::SliceKernel); REGISTER_OP_CUDA_KERNEL( slice_grad, 
ops::SliceGradKernel, ops::SliceGradKernel, ops::SliceGradKernel, - ops::SliceGradKernel); + ops::SliceGradKernel, + ops::SliceGradKernel); diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index 8fbf299a7c056aff3bfd4cbd3e3cc28fd3c6ccf2..db44bd394a2ce280c06274f728dcf95d266f94cf 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -14,7 +14,9 @@ limitations under the License. */ #include "paddle/fluid/operators/softmax_op.h" +#include #include +#include #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cudnn_helper.h" @@ -199,14 +201,10 @@ class SoftmaxOpGradMaker : public framework::SingleGradOpDescMaker { } }; -class SoftmaxInplaceInToOut : public framework::InplaceInToOut { +class SoftmaxInplaceInToOut : public framework::InplaceOpInference { public: - using framework::InplaceInToOut::InplaceInToOut; - - protected: - std::unordered_map Apply( - const framework::OpDesc& op_desc, - framework::BlockDesc* block) const override { + std::unordered_map operator()( + const framework::OpDesc& op_desc) const override { return std::unordered_map{ {"X", "Out"}, }; diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc index 7754d2bfebdbc81e25432641b2eb4315386f75ff..fda971b20e27b68cab6110c323469f0d1c77cb59 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" +#include namespace paddle { namespace operators { @@ -187,7 +188,6 @@ class SoftmaxGradMaker : public framework::SingleGradOpDescMaker { grad_op->SetType("softmax_with_cross_entropy_grad"); grad_op->SetInput("Label", Input("Label")); grad_op->SetInput("Softmax", Output("Softmax")); - grad_op->SetInput("Loss", Output("Loss")); grad_op->SetInput(framework::GradVarName("Softmax"), OutputGrad("Softmax")); grad_op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss")); grad_op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits")); diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index 52b8dcc681b1f97d5ba03697257509cae1e6b484..89aaac4cbe6399af08b3d340896df7a07e1be543 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -439,7 +439,8 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { context.Input(framework::GradVarName("Loss"))->data(); Tensor* logit_grad = context.Output(framework::GradVarName("Logits")); - logit_grad->ShareDataWith(*context.Input("Softmax")); + framework::TensorCopy(*context.Input("Softmax"), context.GetPlace(), + context.device_context(), logit_grad); T* logit_grad_data = logit_grad->data(); const int batch_size = logit_grad->dims()[0]; diff --git a/paddle/fluid/operators/spectral_norm_op.cc b/paddle/fluid/operators/spectral_norm_op.cc index 357d055756523cd83bf0e4b30719155b32c65974..04f659a465a345653d251cbe6703309c804fe614 100644 --- a/paddle/fluid/operators/spectral_norm_op.cc +++ b/paddle/fluid/operators/spectral_norm_op.cc @@ -10,6 +10,9 @@ limitations under the License. 
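The Padding kernel added to slice_op.cu above implements slice_grad for float16: the input gradient is first zero-filled with math::SetConstant, then every element of the (smaller) output gradient is scattered to its source position. The core is a flat-index round trip: unravel the output index into coordinates, add the slice offsets, and re-linearize against the input shape. A host-side sketch of that transform (the kernel applies it once per thread):

```cpp
#include <cstdint>

// Map a flat index into d_out to the corresponding flat index into d_in,
// given per-dimension sizes and the slice offsets (sketch of Padding<D>).
template <int D>
int64_t OutIndexToInIndex(int64_t out_idx, const int (&out_dims)[D],
                          const int (&in_dims)[D], const int (&offsets)[D]) {
  int coords[D];
  for (int i = D - 1; i >= 0; --i) {  // unravel, innermost dimension first
    coords[i] = static_cast<int>(out_idx % out_dims[i]) + offsets[i];
    out_idx /= out_dims[i];
  }
  int64_t in_idx = 0;
  for (int i = 0; i < D; ++i) in_idx = in_idx * in_dims[i] + coords[i];
  return in_idx;
}
```

The rank is a compile-time template parameter (hence the switch over cases 1 through 6 above), which lets coords live in registers instead of local memory.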
*/ #include "paddle/fluid/operators/spectral_norm_op.h" + +#include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { @@ -156,6 +159,28 @@ class SpectralNormOpMaker : public framework::OpProtoAndCheckerMaker { } }; +class SpectralNormGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("spectral_norm_grad"); + + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetInput("Weight", Input("Weight")); + op->SetInput("U", Input("U")); + op->SetInput("V", Input("V")); + + op->SetOutput(framework::GradVarName("Weight"), InputGrad("Weight")); + + op->SetAttrMap(Attrs()); + + return op; + } +}; + class SpectralNormOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -185,7 +210,7 @@ class SpectralNormOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OPERATOR(spectral_norm, ops::SpectralNormOp, ops::SpectralNormOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::SpectralNormGradOpDescMaker); REGISTER_OPERATOR(spectral_norm_grad, ops::SpectralNormOpGrad); REGISTER_OP_CPU_KERNEL( spectral_norm, diff --git a/paddle/fluid/operators/split_selected_rows_op.cc b/paddle/fluid/operators/split_selected_rows_op.cc index 0e7b1463d1ba81aed53e0e3f3a90d2a1fbf0ffbc..88dfebc0cff0d0f7752c372780f1d952667ec630 100644 --- a/paddle/fluid/operators/split_selected_rows_op.cc +++ b/paddle/fluid/operators/split_selected_rows_op.cc @@ -14,6 +14,8 @@ limitations under the License. */ #include "paddle/fluid/operators/split_selected_rows_op.h" +#include + namespace paddle { namespace operators { @@ -60,10 +62,9 @@ class SplitSelectedRowsOp : public framework::OperatorWithKernel { class SplitSelectedRowsOpInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { - for (auto &out_var : op_desc.Output("Out")) { - block->Var(out_var)->SetType(framework::proto::VarType::SELECTED_ROWS); + void operator()(framework::InferVarTypeContext *ctx) const override { + for (auto &out_var : ctx->Output("Out")) { + ctx->SetType(out_var, framework::proto::VarType::SELECTED_ROWS); } } }; diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc index e389c6a65e1e8220685294931c4d08e6fd928b7f..dc15df2c3c1b8a2964312d983be8ce362d3ab95d 100644 --- a/paddle/fluid/operators/squeeze_op.cc +++ b/paddle/fluid/operators/squeeze_op.cc @@ -40,7 +40,7 @@ class SqueezeOpInferShape : public framework::InferShapeBase { "tensor's rank."); } - auto out_dims = GetOutputShape(axes, x_dims); + auto out_dims = GetOutputShape(axes, x_dims, false); ctx->SetOutputDim("Out", out_dims); if (x_dims[0] == out_dims[0]) { // Only pass LoD when the first dimension of output and Input(X) @@ -50,7 +50,8 @@ class SqueezeOpInferShape : public framework::InferShapeBase { } static framework::DDim GetOutputShape(const std::vector squeeze_dims, - const framework::DDim &in_dims) { + const framework::DDim &in_dims, + bool is_runtime) { size_t num_squeeze_dims = squeeze_dims.size(); int cnt_squeezed_dims = 0; bool should_squeeze[9] = {false}; @@ -71,9 +72,12 @@ class SqueezeOpInferShape : public framework::InferShapeBase { // Check current index, the upper limit has beed checked in line 36. 
PADDLE_ENFORCE(current >= 0, "Invalid axis, the negative axis is out of range."); - PADDLE_ENFORCE(in_dims[current] == 1, - "Invalid axis index, the axis that will be squeezed " - "should be equal to 1."); + + if (is_runtime) { + PADDLE_ENFORCE(in_dims[current] == 1, + "Invalid axis index, the axis that will be squeezed " + "should be equal to 1."); + } if (!(should_squeeze[current])) { ++cnt_squeezed_dims; @@ -94,6 +98,7 @@ class SqueezeOpInferShape : public framework::InferShapeBase { } }; +// TODO(paddle-dev): Should use OpKernel. class SqueezeOp : public framework::OperatorBase { public: using OperatorBase::OperatorBase; @@ -103,7 +108,7 @@ class SqueezeOp : public framework::OperatorBase { const platform::Place &place) const override { auto &axes = Attr>("axes"); auto x_dims = scope.FindVar(Input("X"))->Get().dims(); - auto out_dims = SqueezeOpInferShape::GetOutputShape(axes, x_dims); + auto out_dims = SqueezeOpInferShape::GetOutputShape(axes, x_dims, true); framework::AttributeMap attrs; attrs["shape"] = framework::vectorize2int(out_dims); @@ -223,7 +228,7 @@ class Squeeze2Op : public framework::OperatorBase { const platform::Place &place) const override { auto &axes = Attr>("axes"); auto x_dims = scope.FindVar(Input("X"))->Get().dims(); - auto out_dims = Squeeze2OpInferShape::GetOutputShape(axes, x_dims); + auto out_dims = Squeeze2OpInferShape::GetOutputShape(axes, x_dims, true); framework::AttributeMap attrs; attrs["shape"] = framework::vectorize2int(out_dims); diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index 7abfbbd3cb5e5374441c511d82663788c39c04c6..1391148ccf5d13082cb31ef2e143249e8ef95bfc 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -12,6 +12,7 @@ limitations under the License. */ #include "paddle/fluid/operators/sum_op.h" #include +#include #include #include @@ -159,24 +160,20 @@ the LoD information with the first input. 
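The is_runtime flag threaded through GetOutputShape above exists because at graph-construction time a dimension may still be -1 (unknown), so the "squeezed axis must equal 1" check can only be enforced once real shapes are bound: the two OperatorBase call sites pass true, while InferShape passes false. A reduced sketch of the pattern (the real function also squeezes every size-1 axis when axes is empty):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Drop the listed axes from a shape; validate them only at runtime, when
// no entry can still be the placeholder -1.
std::vector<int64_t> SqueezeShape(const std::vector<int64_t>& in,
                                  const std::vector<int>& axes,
                                  bool is_runtime) {
  std::vector<bool> drop(in.size(), false);
  for (int a : axes) {
    if (a < 0) a += static_cast<int>(in.size());
    if (is_runtime) assert(in[a] == 1 && "squeezed axis must equal 1");
    drop[a] = true;  // at compile time in[a] may legitimately be -1
  }
  std::vector<int64_t> out;
  for (size_t i = 0; i < in.size(); ++i)
    if (!drop[i]) out.push_back(in[i]);
  return out;
}
```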
class SumOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc& op_desc, - framework::BlockDesc* block) const override { - auto& inputs = op_desc.Input("X"); + void operator()(framework::InferVarTypeContext* ctx) const override { + auto& inputs = ctx->Input("X"); auto var_type = framework::proto::VarType::SELECTED_ROWS; - for (auto& name : op_desc.Input("X")) { - VLOG(10) << name << " " - << block->FindRecursiveOrCreateVar(name).GetType(); + for (auto& name : ctx->Input("X")) { + VLOG(10) << name << " " << ctx->GetType(name); } bool any_input_is_lod_tensor = std::any_of( - inputs.begin(), inputs.end(), [block](const std::string& name) { - return block->FindRecursiveOrCreateVar(name).GetType() == - framework::proto::VarType::LOD_TENSOR; + inputs.begin(), inputs.end(), [ctx](const std::string& name) { + return ctx->GetType(name) == framework::proto::VarType::LOD_TENSOR; }); - auto is_tensor_array = [block](const std::string& name) { - return block->FindRecursiveOrCreateVar(name).GetType() == - framework::proto::VarType::LOD_TENSOR_ARRAY; + auto is_tensor_array = [ctx](const std::string& name) { + return ctx->GetType(name) == framework::proto::VarType::LOD_TENSOR_ARRAY; }; bool any_input_is_tensor_array = @@ -188,8 +185,7 @@ class SumOpVarTypeInference : public framework::VarTypeInference { if (!all_inputs_are_tensor_array) { std::ostringstream os; for (auto& each : inputs) { - os << " " << each << " type is " - << block->FindRecursiveOrCreateVar(each).GetType() << "\n"; + os << " " << each << " type is " << ctx->GetType(each) << "\n"; } PADDLE_ENFORCE(all_inputs_are_tensor_array, "Not all inputs are tensor array:\n%s", os.str()); @@ -199,11 +195,9 @@ class SumOpVarTypeInference : public framework::VarTypeInference { var_type = framework::proto::VarType::LOD_TENSOR; } - auto out_var_name = op_desc.Output("Out").front(); - auto& out_var = block->FindRecursiveOrCreateVar(out_var_name); - out_var.SetType(var_type); - auto& in_var = detail::Ref(block->FindVarRecursive(inputs.front())); - out_var.SetDataType(in_var.GetDataType()); + auto out_var_name = ctx->Output("Out").front(); + ctx->SetType(out_var_name, var_type); + ctx->SetDataType(out_var_name, ctx->GetDataType(inputs.front())); } }; diff --git a/paddle/fluid/operators/sync_batch_norm_op.cc b/paddle/fluid/operators/sync_batch_norm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d6cf27fd779eeddc94c1839e46892a99f61bd1bf --- /dev/null +++ b/paddle/fluid/operators/sync_batch_norm_op.cc @@ -0,0 +1,20 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
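The two new files that follow add synchronized batch normalization: instead of normalizing with per-GPU statistics, every device reduces its local per-channel E[x] and E[x^2] into a (2C+1)-element buffer (the trailing 1.0 counts participating devices), a single ncclAllReduce sums the buffers in place, and the global statistics fall out by averaging. A sketch of that final step, assuming every device holds an equally sized shard (which is what dividing by the device count implies):

```cpp
#include <vector>

// After ncclAllReduce, stats = [sum of local E[x] per channel |
// sum of local E[x^2] per channel | device count].
void GlobalStats(const std::vector<double>& stats, int C,
                 std::vector<double>* mean, std::vector<double>* var) {
  double device_num = stats[2 * C];  // each device contributed 1.0
  mean->assign(C, 0.0);
  var->assign(C, 0.0);
  for (int c = 0; c < C; ++c) {
    double m = stats[c] / device_num;       // global E[x]
    double m2 = stats[C + c] / device_num;  // global E[x^2]
    (*mean)[c] = m;
    (*var)[c] = m2 - m * m;  // Var[x] = E[x^2] - E[x]^2
  }
}
```

The backward pass uses the same trick for its gradient sums, which is why both reduction kernels end by writing 1.0 into slot 2*C.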
*/ + +#include "paddle/fluid/operators/batch_norm_op.h" + +namespace ops = paddle::operators; +REGISTER_OPERATOR(sync_batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, + ops::BatchNormOpInferVarType, ops::BatchNormGradMaker); +REGISTER_OPERATOR(sync_batch_norm_grad, ops::BatchNormGradOp); diff --git a/paddle/fluid/operators/sync_batch_norm_op.cu b/paddle/fluid/operators/sync_batch_norm_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..a5984bfaaaf96f7a412176bb9868dc44488acf3f --- /dev/null +++ b/paddle/fluid/operators/sync_batch_norm_op.cu @@ -0,0 +1,452 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include "cub/cub.cuh" +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/operators/batch_norm_op.h" +#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/platform/nccl_helper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DataLayout = framework::DataLayout; +template +using CudnnDataType = platform::CudnnDataType; + +template +__global__ void KeLocalStats(const T *x, int N, int M, int C, T *mean_var) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + for (int k = blockIdx.x; k < C; k += gridDim.x) { + T x_sum = 0; + T x2_sum = 0; + for (int i = threadIdx.x; i < N * M; i += BlockDim) { + int id = layout == framework::DataLayout::kNCHW + ? (i / M) * C * M + k * M + i % M + : i * C + k; + T x_in = x[id]; + x_sum += x_in; + x2_sum += x_in * x_in; + } + __syncthreads(); + T out = BlockReduce(temp_storage).Reduce(x_sum, cub::Sum()); + __syncthreads(); + if (threadIdx.x == 0) { + mean_var[k] = out / (N * M); + } + out = BlockReduce(temp_storage).Reduce(x2_sum, cub::Sum()); + __syncthreads(); + if (threadIdx.x == 0) { + mean_var[k + C] = out / (N * M); + } + } + if (blockIdx.x == 0 && threadIdx.x == 0) { + mean_var[2 * C] = static_cast(1.0); + } +} + +template +__global__ void KeSyncAndMovingStats(T *means, T *variances, T *num_dev, + const int C, const T momentum, + const double epsilon, T *sv_mean_data, + T *sv_inv_var_data, T *moving_means, + T *moving_variances) { + // sync stats across multi-devices + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (int i = gid; i < C; i += stride) { + T mean = means[i] / (*num_dev); + T var = variances[i] / (*num_dev); + var = var - mean * mean; + + // sync stats + sv_mean_data[i] = mean; + sv_inv_var_data[i] = 1.0 / sqrt(var + epsilon); + variances[i] = var; + + // moving stats + moving_means[i] = moving_means[i] * momentum + mean * (1. - momentum); + moving_variances[i] = + moving_variances[i] * momentum + var * (1. 
- momentum); + } +} + +template +static __global__ void KeNormAffine(const T *x, const T *scale, const T *bias, + const T *mean, const T *variance, + const double epsilon, const int C, + const int M, const int num, T *y) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (int i = gid; i < num; i += stride) { + const int c = layout == framework::DataLayout::kNCHW ? (i / M) % C : i % C; + y[i] = (x[i] - mean[c]) / sqrt(variance[c] + epsilon) * scale[c] + bias[c]; + } +} + +template +class SyncBatchNormKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + double epsilon = static_cast(ctx.Attr("epsilon")); + const float momentum = ctx.Attr("momentum"); + const bool is_test = ctx.Attr("is_test"); + const std::string layout_str = ctx.Attr("data_layout"); + const DataLayout layout = framework::StringToDataLayout(layout_str); + const bool use_global_stats = ctx.Attr("use_global_stats"); + PADDLE_ENFORCE( + !use_global_stats, + "sync_batch_norm doesn't support to set use_global_stats True. ", + "Please use batch_norm in this case."); + + const auto *x = ctx.Input("X"); + const auto &x_dims = x->dims(); + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "The Input dim size should be between 2 and 5"); + int N, C, H, W, D; + ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D); + int x_numel = x->numel(); + + const T *x_d = x->data(); + const T *s_d = ctx.Input("Scale")->data(); + const T *b_d = ctx.Input("Bias")->data(); + + auto *y = ctx.Output("Y"); + T *y_d = y->mutable_data(ctx.GetPlace()); + + const T *mean_data = nullptr; + const T *var_data = nullptr; + + auto &dev_ctx = ctx.cuda_device_context(); + auto stream = dev_ctx.stream(); + auto *comm = dev_ctx.nccl_comm(); + const int block = 512; + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + + paddle::memory::AllocationPtr alloc_ptr{nullptr}; + + if (is_test) { + const auto *est_mean = ctx.Input("Mean"); + const auto *est_var = ctx.Input("Variance"); + mean_data = est_mean->data(); + var_data = est_var->data(); + } else { + auto &allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + // x, x^2, 1, here 1 is used to calc device num + // device num also can be got from platform::DeviceContextPool + const int bytes = (C * 2 + 1) * sizeof(T); + alloc_ptr = allocator.Allocate(bytes); + + T *stats = reinterpret_cast(alloc_ptr->ptr()); + const int threads = 256; + int grid = std::min(C, (max_threads + threads - 1) / threads); + if (layout == framework::DataLayout::kNCHW) { + KeLocalStats< + T, threads, + framework::DataLayout::kNCHW><<>>( + x_d, N, H * W * D, C, stats); + } else { + KeLocalStats< + T, threads, + framework::DataLayout::kNHWC><<>>( + x_d, N, H * W * D, C, stats); + } + + Tensor c_g_st; + T *c_g_st_d = c_g_st.mutable_data({2 * C + 1}, platform::CPUPlace()); + auto gplace = boost::get(ctx.GetPlace()); + memory::Copy(platform::CPUPlace(), c_g_st_d, gplace, stats, bytes, 0); + + int dtype = platform::ToNCCLDataType(x->type()); + // In-place operation + PADDLE_ENFORCE(platform::dynload::ncclAllReduce( + stats, stats, 2 * C + 1, static_cast(dtype), ncclSum, + comm, stream)); + + // moving mean/variance + auto *mean_out = ctx.Output("MeanOut"); + auto *variance_out = ctx.Output("VarianceOut"); + T *est_mean_data = mean_out->mutable_data(ctx.GetPlace()); + T *est_var_data = variance_out->mutable_data(ctx.GetPlace()); + + auto *saved_mean = ctx.Output("SavedMean"); + auto *saved_inv_variance = 
ctx.Output("SavedVariance"); + T *sv_mean_data = saved_mean->mutable_data(ctx.GetPlace()); + T *sv_inv_var_data = saved_inv_variance->mutable_data(ctx.GetPlace()); + + // Note, Input('Mean')/Input('Variance') share variable with + // Output('MeanOut')/Output('VarianceOut') + KeSyncAndMovingStats<<<(C + block - 1) / block, block, 0, stream>>>( + stats, stats + C, stats + 2 * C, C, momentum, epsilon, sv_mean_data, + sv_inv_var_data, est_mean_data, est_var_data); + + mean_data = sv_mean_data; + var_data = stats + C; + } + + int grid2 = (std::min(x_numel, max_threads) + block - 1) / block; + if (layout == framework::DataLayout::kNCHW) { + KeNormAffine<<>>( + x_d, s_d, b_d, mean_data, var_data, epsilon, C, H * W * D, x_numel, + y_d); + } else { + KeNormAffine<<>>( + x_d, s_d, b_d, mean_data, var_data, epsilon, C, H * W * D, x_numel, + y_d); + } + } +}; + +template +__global__ void KeBackwardLocalStats(const T *dy, const T *x, const T *means, + int N, int M, int C, T *sum_dy_prod) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + for (int k = blockIdx.x; k < C; k += gridDim.x) { + T sum1 = 0; + T sum2 = 0; + T mean = means[k]; + for (int i = threadIdx.x; i < N * M; i += blockDim.x) { + int id = layout == framework::DataLayout::kNCHW + ? (i / M) * C * M + k * M + i % M + : i * C + k; + T g = dy[id]; + sum1 += g; + sum2 += g * (x[id] - mean); + } + + __syncthreads(); + T out = BlockReduce(temp_storage).Reduce(sum1, cub::Sum()); + __syncthreads(); + if (threadIdx.x == 0) { + sum_dy_prod[k] = out; + } + out = BlockReduce(temp_storage).Reduce(sum2, cub::Sum()); + __syncthreads(); + if (threadIdx.x == 0) { + sum_dy_prod[k + C] = out; + } + } + if (blockIdx.x == 0 && threadIdx.x == 0) { + sum_dy_prod[2 * C] = static_cast(1.0); + } +} + +template +static __global__ void KeBNBackwardScaleBias(const T *dy, const T *x, + const T *mean, + const T *inv_variance, + const double epsilon, const int N, + const int C, const int HxW, + T *dscale, T *dbias) { + const int outer_size = C; + const int inner_size = N * HxW; + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + T ds_sum = static_cast(0); + T db_sum = static_cast(0); + + T inv_var_i = inv_variance[i]; + T mean_i = mean[i]; + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int id = layout == framework::DataLayout::kNCHW + ? ((j / HxW) * C + i) * HxW + (j % HxW) + : j * outer_size + i; + ds_sum += dy[id] * (x[id] - mean_i); + db_sum += dy[id]; + } + __syncthreads(); + double os = BlockReduce(temp_storage) + .Reduce(static_cast(ds_sum), cub::Sum()); + __syncthreads(); + double ob = BlockReduce(temp_storage) + .Reduce(static_cast(db_sum), cub::Sum()); + __syncthreads(); + if (threadIdx.x == 0) { + dscale[i] = static_cast(os * inv_var_i); + dbias[i] = static_cast(ob); + } + __syncthreads(); + } +} + +template +static __global__ void KeBNBackwardData(const T *dy, const T *x, const T *beta, + const T *mean, const T *inv_variance, + const T *g_sum_dy, + const T *g_sum_dy_prod, + const T *num_dev, const double epsilon, + const int C, const int HxW, + const int num, T *dx) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + T scale = static_cast(C) / num; + T dev_num = num_dev[0]; + for (int i = gid; i < num; i += stride) { + const int c = layout == framework::DataLayout::kNCHW ? 
i / HxW % C : i % C; + T inv_var = inv_variance[c]; + T s_d = beta[c]; + T gvar = -1.0 * (g_sum_dy_prod[c] / dev_num) * s_d * inv_var * + (inv_var * inv_var); + T gmean = -1.0 * (g_sum_dy[c] / dev_num) * s_d * inv_var; + + dx[i] = + dy[i] * s_d * inv_var + gmean * scale + gvar * scale * (x[i] - mean[c]); + } +} + +// Deriving the Gradient for the Backward Pass of Batch Normalization +// https://kevinzakka.github.io/2016/09/14/batch_normalization/ +template +class SyncBatchNormGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + double epsilon = static_cast(ctx.Attr("epsilon")); + const std::string layout_str = ctx.Attr("data_layout"); + + const DataLayout layout = framework::StringToDataLayout(layout_str); + const auto *x = ctx.Input("X"); + const auto *d_y = ctx.Input(framework::GradVarName("Y")); + const auto *scale = ctx.Input("Scale"); + + const auto &x_dims = x->dims(); + + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "The Input dim size should be between 2 and 5"); + int N, C, H, W, D; + ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D); + + // init output + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + + d_x->mutable_data(ctx.GetPlace()); + if (d_scale && d_bias) { + d_scale->mutable_data(ctx.GetPlace()); + d_bias->mutable_data(ctx.GetPlace()); + } + PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL); + PADDLE_ENFORCE_EQ(scale->dims()[0], C); + + std::vector dims; + std::vector strides; + if (layout == DataLayout::kNCHW) { + dims = {N, C, H, W, D}; + strides = {C * H * W * D, H * W * D, W * D, D, 1}; + } else { + dims = {N, C, H, W, D}; + strides = {H * W * C * D, 1, W * D * C, D * C, C}; + } + + const T *x_d = x->data(); + const T *dy_d = d_y->data(); + + auto &dev_ctx = ctx.cuda_device_context(); + auto stream = dev_ctx.stream(); + auto *comm = dev_ctx.nccl_comm(); + + const T *saved_mean = ctx.Input("SavedMean")->data(); + const T *saved_inv_var = ctx.Input("SavedVariance")->data(); + auto &allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + const int bytes = (C * 2 + 1) * sizeof(T); + auto alloc_ptr = allocator.Allocate(bytes); + T *stats = reinterpret_cast(alloc_ptr->ptr()); + + const int threads = 256; + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + int grid = std::min(C, (max_threads + threads - 1) / threads); + int x_numel = x->numel(); + int fsize = H * W * D; + + if (layout == framework::DataLayout::kNCHW) { + KeBackwardLocalStats< + T, threads, + framework::DataLayout::kNCHW><<>>( + dy_d, x_d, saved_mean, N, fsize, C, stats); + } else { + KeBackwardLocalStats< + T, threads, + framework::DataLayout::kNHWC><<>>( + dy_d, x_d, saved_mean, N, fsize, C, stats); + } + int dtype = platform::ToNCCLDataType(x->type()); + // In-place operation + PADDLE_ENFORCE(platform::dynload::ncclAllReduce( + stats, stats, 2 * C + 1, static_cast(dtype), ncclSum, + comm, stream)); + + const int block = 512; + int grid2 = (std::min(x_numel, max_threads) + block - 1) / block; + if (layout == framework::DataLayout::kNCHW) { + if (d_scale && d_bias) { + KeBNBackwardScaleBias< + T, threads, + framework::DataLayout::kNCHW><<>>( + dy_d, x_d, saved_mean, saved_inv_var, epsilon, N, C, fsize, + d_scale->data(), d_bias->data()); + } + if (d_x) { + KeBNBackwardData< + T, 
framework::DataLayout::kNCHW><<>>( + dy_d, x_d, scale->data(), saved_mean, saved_inv_var, stats, + stats + C, stats + 2 * C, epsilon, C, fsize, x->numel(), + d_x->data()); + } + } else { + if (d_scale && d_bias) { + KeBNBackwardScaleBias< + T, threads, + framework::DataLayout::kNHWC><<>>( + dy_d, x_d, saved_mean, saved_inv_var, epsilon, N, C, fsize, + d_scale->data(), d_bias->data()); + } + if (d_x) { + KeBNBackwardData< + T, framework::DataLayout::kNHWC><<>>( + dy_d, x_d, scale->data(), saved_mean, saved_inv_var, stats, + stats + C, stats + 2 * C, epsilon, C, fsize, x->numel(), + d_x->data()); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_CUDA_KERNEL( + sync_batch_norm, ops::SyncBatchNormKernel, + ops::SyncBatchNormKernel); +REGISTER_OP_CUDA_KERNEL( + sync_batch_norm_grad, + ops::SyncBatchNormGradKernel, + ops::SyncBatchNormGradKernel); diff --git a/paddle/fluid/operators/tensor_array_to_tensor_op.cc b/paddle/fluid/operators/tensor_array_to_tensor_op.cc index 58a74ec2c104f66e9e884cffd00e7fa6622e4714..2b83c42f205c6ec0c14305586e179a003ce2619f 100644 --- a/paddle/fluid/operators/tensor_array_to_tensor_op.cc +++ b/paddle/fluid/operators/tensor_array_to_tensor_op.cc @@ -177,10 +177,9 @@ class LoDTensorArray2TensorGradInferShape : public framework::InferShapeBase { class LoDTensorArray2TensorGradInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { - for (auto &out_var : op_desc.Output(framework::GradVarName("X"))) { - block->Var(out_var)->SetType(framework::proto::VarType::LOD_TENSOR_ARRAY); + void operator()(framework::InferVarTypeContext *ctx) const override { + for (auto &out_var : ctx->Output(framework::GradVarName("X"))) { + ctx->SetType(out_var, framework::proto::VarType::LOD_TENSOR_ARRAY); } } }; diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc index 031335009b692f9d1f73070c88e8e79d852cbe36..6cf3e65e00ff6dd6a87d2b699ae89b9bde5d5462 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc @@ -30,6 +30,9 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Ys", "A list of outputs").AsDuplicable(); AddAttr("subgraph", "the subgraph."); AddAttr("calibration_data", "the calibration data for int8"); + AddAttr( + "engine_serialized_data", + "the serialized data contains the all info of the ICUDAEngine"); AddAttr( "engine_key", "The engine_key here is used to distinguish different TRT Engines"); @@ -43,8 +46,7 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { class TensorRTEngineInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override {} + void operator()(framework::InferVarTypeContext *ctx) const override {} }; } // namespace operators diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 2ff35c7c6ac6409d529de5b794bfc322b1f5dd9b..c36673312489738ad0475a0b70a23a1c6c948b9d 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -16,8 +16,10 @@ #ifdef PADDLE_WITH_CUDA +#include #include #include +#include #include #include 
"paddle/fluid/framework/executor.h" @@ -31,37 +33,6 @@ namespace paddle { namespace operators { -using FluidDT = framework::proto::VarType_Type; -using TRT_DT = nvinfer1::DataType; - -namespace { // NOLINT - -TRT_DT FluidDataType2TRT(FluidDT type) { - switch (type) { - case FluidDT::VarType_Type_FP32: - return TRT_DT::kFLOAT; - case FluidDT::VarType_Type_INT32: - return TRT_DT::kINT32; - default: - return TRT_DT::kINT32; - } - PADDLE_THROW("unkown type"); - return TRT_DT::kINT32; -} - -nvinfer1::Dims Vec2TRT_Dims(const std::vector &shape) { - PADDLE_ENFORCE_GT(shape.size(), 1UL, - "TensorRT' tensor input requires at least 2 dimensions"); - PADDLE_ENFORCE_LE(shape.size(), 4UL, - "TensorRT' tensor input requires at most 4 dimensions"); - PADDLE_ENFORCE(shape.size() == 4UL || shape.size() == 2UL); - if (shape.size() == 4UL) - return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]); - return nvinfer1::DimsCHW(shape[1], 1, 1); -} - -} // namespace // NOLINT - using inference::Singleton; using inference::tensorrt::TensorRTEngine; using inference::tensorrt::TRTInt8Calibrator; @@ -79,6 +50,7 @@ class TensorRTEngineOp : public framework::OperatorBase { bool enable_int8_; std::string calibration_data_; std::string engine_key_; + std::string engine_serialized_data_; bool calibration_mode_; public: @@ -93,6 +65,7 @@ class TensorRTEngineOp : public framework::OperatorBase { enable_int8_ = Attr("enable_int8"); calibration_data_ = Attr("calibration_data"); engine_key_ = Attr("engine_key"); + engine_serialized_data_ = Attr("engine_serialized_data"); auto params = Attr>("parameters"); for (const auto ¶m : params) { @@ -125,7 +98,8 @@ class TensorRTEngineOp : public framework::OperatorBase { RunCalibration(scope, dev_place); return; } - RunTrt(scope, dev_place); + auto *trt_engine = GetEngine(scope, dev_place); + RunTrt(scope, dev_place, trt_engine); } void RunCalibration(const framework::Scope &scope, @@ -136,10 +110,6 @@ class TensorRTEngineOp : public framework::OperatorBase { LOG_FIRST_N(INFO, 1) << "The TRT engine: " << engine_key_ << " is running calibration trt int8... 
"; int runtime_batch = 1; - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(dev_place); - auto stream = - reinterpret_cast(dev_ctx).stream(); if (!Singleton::Global().Has(engine_key_)) { TRTCalibratorEngine *calib_res = Singleton::Global().Create(engine_key_); @@ -156,11 +126,11 @@ class TensorRTEngineOp : public framework::OperatorBase { calib_buffers, runtime_batch, engine_key_, dev_place)); calib_res->thr_.reset(new std::thread([&]() { calib_res->engine_.reset(new TensorRTEngine( - max_batch_size_, workspace_size_, stream, - boost::get(dev_place).device, enable_int8_, - calib_res->calib_.get())); + max_batch_size_, workspace_size_, enable_int8_, + calib_res->calib_.get(), + boost::get(dev_place).device)); VLOG(3) << "start the calib trt engine thread"; - Prepare(scope, dev_place, calib_res->engine_.get()); + PrepareTRTEngine(scope, calib_res->engine_.get()); })); } @@ -180,28 +150,29 @@ class TensorRTEngineOp : public framework::OperatorBase { RunNativeImpl(scope, dev_place); } - void RunTrt(const framework::Scope &scope, - const platform::Place &dev_place) const { + void RunTrt(const framework::Scope &scope, const platform::Place &dev_place, + TensorRTEngine *engine) const { int runtime_batch = 1; platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(dev_place); auto stream = reinterpret_cast(dev_ctx).stream(); - if (trt_engine_.get() == nullptr) { - trt_engine_.reset( - new TensorRTEngine(max_batch_size_, workspace_size_, stream, - boost::get(dev_place).device, - enable_int8_, calibrator_.get())); - Prepare(scope, dev_place, trt_engine_.get()); - } - auto *engine = trt_engine_.get(); PADDLE_ENFORCE(!input_names_.empty(), "should pass more than one inputs"); std::vector output_maps = Attr>("output_name_mapping"); - // Convert input tensor from fluid to engine. + int num_inputs = 0; + + for (const auto &x : Inputs("Xs")) { + if (param_names_.count(x)) continue; + num_inputs += 1; + } + const int num_bindings = num_inputs + Outputs("Ys").size(); + std::vector buffers(num_bindings); + + // Bind input tensor to TRT. for (const auto &x : Inputs("Xs")) { if (param_names_.count(x)) continue; // convert input and copy to TRT engine's buffer @@ -209,28 +180,20 @@ class TensorRTEngineOp : public framework::OperatorBase { inference::analysis::GetFromScope(scope, x); auto t_shape = framework::vectorize(t.dims()); runtime_batch = t_shape[0]; - if (platform::is_cpu_place(t.place())) { - engine->SetInputFromCPU(x, static_cast(t.data()), - t.memory_size()); - } else { - engine->SetInputFromGPU(x, static_cast(t.data()), - t.memory_size()); - } - } - cudaStreamSynchronize(stream); - PADDLE_ENFORCE_LE(runtime_batch, max_batch_size_); - // Execute the engine. - engine->Execute(runtime_batch); + const int bind_index = engine->engine()->getBindingIndex(x.c_str()); + PADDLE_ENFORCE(bind_index < num_bindings, + "The bind index should be less than num_bindings"); + buffers[bind_index] = static_cast(t.data()); + } - // Convert output tensor from engine to fluid + // Bind output tensor to TRT. int output_index = 0; VLOG(4) << "TensorRT Engine Op Outputs:"; for (const auto &y : Outputs("Ys")) { - VLOG(4) << y; - // convert output and copy to fluid. 
- nvinfer1::ITensor *trt_t = engine->GetITensor(output_maps[output_index]); - auto dims = trt_t->getDimensions(); + const int bind_index = + engine->engine()->getBindingIndex(output_maps[output_index].c_str()); + auto dims = engine->engine()->getBindingDimensions(bind_index); // Use the output ITensor's dims to reshape the Fluid Tensor. // The ITensor doesn't contain the batch size dim. std::vector ddim; @@ -238,71 +201,55 @@ class TensorRTEngineOp : public framework::OperatorBase { for (int i = 0; i < dims.nbDims; i++) { ddim.push_back(dims.d[i]); } - auto *fluid_v = scope.FindVar(y); PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y); auto *fluid_t = fluid_v->GetMutable(); - fluid_t->Resize(framework::make_ddim(ddim)); - // TODO(Superjomn) change this float to dtype size. - auto size = - inference::analysis::AccuDims(dims.d, dims.nbDims) * runtime_batch; - engine->GetOutputInGPU( - output_maps[output_index], - fluid_t->mutable_data(platform::CUDAPlace( - boost::get(dev_place).device)), - size * sizeof(float)); + PADDLE_ENFORCE(bind_index < num_bindings, + "The bind index should be less than num_bindings"); + buffers[bind_index] = static_cast(fluid_t->mutable_data( + boost::get(dev_place))); + output_index += 1; } + PADDLE_ENFORCE_LE(runtime_batch, max_batch_size_); + // Execute the engine. + engine->Execute(runtime_batch, &buffers, stream); cudaStreamSynchronize(stream); } - void Prepare(const framework::Scope &scope, const platform::Place &dev_place, - TensorRTEngine *engine) const { + TensorRTEngine *GetEngine(const framework::Scope &scope, + const platform::Place &dev_place) const { + if (!trt_engine_) { + trt_engine_.reset(new inference::tensorrt::TensorRTEngine( + max_batch_size_, workspace_size_, enable_int8_, calibrator_.get(), + boost::get(dev_place).device)); + if (!engine_serialized_data_.empty()) { + trt_engine_->Deserialize(engine_serialized_data_); + } else { + PrepareTRTEngine(scope, trt_engine_.get()); + } + } + return trt_engine_.get(); + } + + void PrepareTRTEngine(const framework::Scope &scope, + TensorRTEngine *engine) const { LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP " "kernel etc). This process may cost a lot of time."; - framework::proto::BlockDesc block_desc; - block_desc.ParseFromString(Attr("subgraph")); + framework::proto::BlockDesc block_proto; + block_proto.ParseFromString(Attr("subgraph")); + framework::BlockDesc block_desc(nullptr, &block_proto); - std::vector output_maps = + std::vector inputs = Inputs("Xs"); + std::vector outputs = Attr>("output_name_mapping"); - engine->InitNetwork(); - - framework::BlockDesc block(nullptr /*programdesc*/, &block_desc); - VLOG(4) << "parsed var size " << block.AllVars().size(); - // Add inputs - VLOG(4) << "declare inputs"; - for (auto &input : Inputs("Xs")) { - if (param_names_.count(input)) continue; - VLOG(4) << "declare input " << input; - - auto &t = - inference::analysis::GetFromScope(scope, input); - auto t_shape = framework::vectorize(t.dims()); - - auto *var = block.FindVar(input); - // TensorRT engine need to create parameters. 
The parameter's description - // should be set in - PADDLE_ENFORCE(var, "no variable called %s", input); - PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR, - "TensorRT engine only takes LoDTensor as input"); - - engine->DeclareInput( - input, FluidDataType2TRT( - var->Proto()->type().lod_tensor().tensor().data_type()), - Vec2TRT_Dims(t_shape)); - } inference::Singleton::Global() - .ConvertBlock(block_desc, param_names_, scope, engine); - - // Add outputs - for (auto &output : output_maps) { - engine->DeclareOutput(output); - } - engine->FreezeNetwork(); + .ConvertBlockToTRTEngine(&block_desc, scope, inputs, param_names_, + outputs, engine); } }; diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index 5a3d9d2c1a3e8111acbad2ddcf4f5469a3a99751..e7ad2f4fe0c654d8928f5793c1ad8052ab766fb5 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -107,6 +107,7 @@ TEST(TensorRTEngineOp, manual) { engine_op_desc.SetAttr("output_name_mapping", std::vector({"z0"})); engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); + engine_op_desc.SetAttr("engine_serialized_data", std::string("")); LOG(INFO) << "create engine op"; auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); @@ -202,6 +203,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { engine_op_desc.SetAttr("output_name_mapping", std::vector({"z3"})); engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); + engine_op_desc.SetAttr("engine_serialized_data", std::string("")); auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc index 9e77f7252de1545e04bd2feaff27374c189dfc48..db763a051d1e08b962a40913d290c69e7c61ec32 100644 --- a/paddle/fluid/operators/top_k_op.cc +++ b/paddle/fluid/operators/top_k_op.cc @@ -34,8 +34,11 @@ class TopkOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GE(k, 1, "k must >= 1"); PADDLE_ENFORCE_GE(input_dims.size(), 1, "input must have >= 1d shape"); - PADDLE_ENFORCE_GE(input_dims[input_dims.size() - 1], k, - "input must have >= k columns"); + + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_GE(input_dims[input_dims.size() - 1], k, + "input must have >= k columns"); + } framework::DDim dims = input_dims; dims[dims.size() - 1] = k; diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index e3132ae76f624f3338d749e4fcebbd0ecd7ffe79..bb6a1c5b165693df4199fe0794daffc2cff789a4 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -112,17 +112,16 @@ uniform distribution. The random result is in set [min, max]. 
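The RunTrt rewrite above stops staging tensors through engine-managed buffers (SetInputFromCPU/GetOutputInGPU in the removed lines): it now resolves every input and output by name with getBindingIndex, drops the raw device pointers into one buffers vector, and calls Execute(runtime_batch, &buffers, stream) once. A sketch of the binding pattern; the helper name and signature are illustrative:

```cpp
#include <NvInfer.h>

#include <string>
#include <utility>
#include <vector>

// Fill the flat binding table TensorRT executes against: one device
// pointer per named engine binding, no intermediate copies.
void BindByName(nvinfer1::ICudaEngine* engine,
                const std::vector<std::pair<std::string, void*>>& tensors,
                std::vector<void*>* buffers) {
  buffers->assign(engine->getNbBindings(), nullptr);
  for (const auto& t : tensors) {
    const int idx = engine->getBindingIndex(t.first.c_str());
    (*buffers)[idx] = t.second;
  }
}
```

Together with the new engine_serialized_data attribute, this lets GetEngine rebuild an engine from a serialized blob instead of re-running the subgraph converter on every process start.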
class UniformRandomOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { - auto out_var_name = op_desc.Output("Out").front(); + void operator()(framework::InferVarTypeContext *ctx) const override { + auto out_var_name = ctx->Output("Out").front(); auto var_data_type = static_cast( - boost::get(op_desc.GetAttr("dtype"))); + boost::get(ctx->GetAttr("dtype"))); - auto out_var = block->FindRecursiveOrCreateVar(out_var_name); - if (out_var.GetType() != framework::proto::VarType::SELECTED_ROWS) { - out_var.SetType(framework::proto::VarType::LOD_TENSOR); + if (ctx->GetType(out_var_name) != + framework::proto::VarType::SELECTED_ROWS) { + ctx->SetType(out_var_name, framework::proto::VarType::LOD_TENSOR); } - out_var.SetDataType(var_data_type); + ctx->SetDataType(out_var_name, var_data_type); } }; diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 9220d35707b286d76ab4824e3f1080453f60bfe6..c3db59563f3ae77acd860216b34d2cfb4f8b6560 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -46,8 +46,9 @@ cc_test(cpu_helper_test SRCS cpu_helper_test.cc DEPS cpu_helper) IF(WITH_GPU) set(GPU_CTX_DEPS dynload_cuda dynamic_loader) + set(dgc_deps dgc) ELSE() - set(GPU_CTX_DEPS) + set(dgc_deps) ENDIF() IF(WITH_MKLDNN) @@ -68,7 +69,8 @@ ENDIF() # memcpy depends on device_context, here add deps individually for # avoiding cycle dependencies cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc ${STREAM_CALLBACK_DEPS} - place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} temp_allocator) + place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} + temp_allocator ${dgc_deps}) if(WIN32) if(WITH_GPU AND NOT WITH_DSO) diff --git a/paddle/fluid/platform/assert.h b/paddle/fluid/platform/assert.h index 2e8fa7c1b8f7f7b8f3154aae691bb100375981dd..497c7b3c87f94c19b4bf1ded33927a353ee1ab84 100644 --- a/paddle/fluid/platform/assert.h +++ b/paddle/fluid/platform/assert.h @@ -37,13 +37,13 @@ limitations under the License. */ } \ } while (0) -#define PADDLE_ASSERT_MSG_CODE(e, m, c) \ - do { \ - if (!(e)) { \ - printf("%s:%d Assertion `%s` failed (%s %d).\n", __FILE__, __LINE__, \ - TOSTRING(e), m, c); \ - asm("trap;"); \ - } \ +#define PADDLE_ASSERT_MSG_CODE(e, m, c) \ + do { \ + if (!(e)) { \ + printf("%s:%d Assertion `%s` failed (%s %ld).\n", __FILE__, __LINE__, \ + TOSTRING(e), m, c); \ + asm("trap;"); \ + } \ } while (0) #else #include diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 920b43b2b1990af58b73888bf7a652d57c20563c..61386bdf05ab4a5b11d94c942c4476abd8698714 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -21,6 +21,8 @@ limitations under the License. 
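A refactor that recurs throughout this diff (split_selected_rows, sum, tensor_array_to_tensor, the TensorRT op, and uniform_random above): VarTypeInference implementations no longer receive an OpDesc plus a BlockDesc and walk the block with FindRecursiveOrCreateVar; they get one InferVarTypeContext with name-based accessors, which can be backed by something other than a BlockDesc (for example, imperative execution). The shape of the new interface, shown on a hypothetical op whose output follows its first input:

```cpp
#include "paddle/fluid/framework/var_type_inference.h"

namespace paddle {
namespace operators {

// Hypothetical op: Out takes both the variable type and the data type of
// the first input, via the context instead of OpDesc/BlockDesc plumbing.
class MirrorInputVarTypeInference : public framework::VarTypeInference {
 public:
  void operator()(framework::InferVarTypeContext* ctx) const override {
    auto& in_name = ctx->Input("X").front();
    auto& out_name = ctx->Output("Out").front();
    ctx->SetType(out_name, ctx->GetType(in_name));
    ctx->SetDataType(out_name, ctx->GetDataType(in_name));
  }
};

}  // namespace operators
}  // namespace paddle
```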
*/ #include "paddle/fluid/platform/cuda_device_guard.h" #endif +#include "glog/logging.h" + namespace paddle { namespace platform { @@ -57,7 +59,6 @@ DeviceContextPool::DeviceContextPool( for (auto& p : places) { set.insert(p); } - for (auto& p : set) { if (platform::is_cpu_place(p)) { #ifdef PADDLE_WITH_MKLDNN @@ -213,6 +214,7 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { CudnnHolder::CudnnHolder(const cudaStream_t* stream, const CUDAPlace& place) : workspace_(nullptr), stream_(stream), place_(place) { + PADDLE_ENFORCE(cudaSetDevice(place_.device)); PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_)); PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, *stream_)); } @@ -253,10 +255,6 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) #endif } - if (dynload::HasCUDNN()) { - cudnn_holder_.reset(new CudnnHolder(&stream_, place)); - } - driver_version_ = GetCUDADriverVersion(place_.device); runtime_version_ = GetCUDARuntimeVersion(place_.device); @@ -317,6 +315,9 @@ CUDADeviceContext::~CUDADeviceContext() { eigen_stream_.reset(); eigen_device_.reset(); PADDLE_ENFORCE(cudaStreamDestroy(stream_)); +#if !defined(_WIN32) + PADDLE_ENFORCE(dynload::ncclCommDestroy(nccl_comm_)); +#endif } Place CUDADeviceContext::GetPlace() const { return place_; } @@ -325,8 +326,17 @@ void CUDADeviceContext::Wait() const { auto& allocator = DeviceTemporaryAllocator::Instance().Get(*this); allocator.Release([this]() { - PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); - PADDLE_ENFORCE(cudaGetLastError()); + cudaError_t e_sync = cudaStreamSynchronize(stream_); + if (e_sync != 0) { + LOG(FATAL) << "cudaStreamSynchronize " << cudaGetErrorString(e_sync) + << " errno:" << e_sync; + } + + cudaError_t e_get = cudaGetLastError(); + if (e_get != 0) { + LOG(FATAL) << "cudaGetLastError " << cudaGetErrorString(e_get) + << " errno:" << e_get; + } }); } @@ -346,12 +356,21 @@ bool CUDADeviceContext::tensor_core_available() const { return cublas_tensor_core_handle_ != nullptr; } +CudnnHolder* CUDADeviceContext::cudnn_holder() const { + std::call_once(init_cudnn_, [&]() { + if (dynload::HasCUDNN()) { + cudnn_holder_.reset(new CudnnHolder(&stream_, place_)); + } + }); + return cudnn_holder_.get(); +} + cudnnHandle_t CUDADeviceContext::cudnn_handle() const { - return cudnn_holder_->cudnn_handle(); + return cudnn_holder()->cudnn_handle(); } CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const { - return CudnnWorkspaceHandle(cudnn_holder_.get()); + return CudnnWorkspaceHandle(cudnn_holder()); } cudaStream_t CUDADeviceContext::stream() const { return stream_; } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index d376f90ad5754d70f3b9f30957eb2e2f584f8da9..778f6613bd49dfbc46e8888cd53b1a4de5fe923d 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -23,6 +23,9 @@ limitations under the License. */ #include "paddle/fluid/platform/cuda_helper.h" #include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/dynload/cudnn.h" +#if !defined(__APPLE__) && !defined(_WIN32) +#include "paddle/fluid/platform/dynload/nccl.h" +#endif #include "paddle/fluid/platform/gpu_info.h" #endif @@ -265,6 +268,14 @@ class CUDADeviceContext : public DeviceContext { /*! \brief Return cuda stream in the device context. */ cudaStream_t stream() const; +#if !defined(_WIN32) + /*! \brief Return nccl communicators. */ + ncclComm_t nccl_comm() const { return nccl_comm_; } + + /*! \brief Set nccl communicators. 
*/ + void set_nccl_comm(ncclComm_t comm) { nccl_comm_ = comm; } +#endif + template void RecordEvent(cudaEvent_t ev, Callback callback) { callback(); @@ -281,14 +292,25 @@ class CUDADeviceContext : public DeviceContext { private: CUDAPlace place_; + mutable std::once_flag init_cudnn_; + std::unique_ptr eigen_device_; std::unique_ptr eigen_stream_; - std::unique_ptr cudnn_holder_; + mutable std::unique_ptr cudnn_holder_; cudaStream_t stream_; std::unique_ptr cublas_handle_; std::unique_ptr cublas_tensor_core_handle_; +#if !defined(_WIN32) + // NCCL communicator (single process version) for NCCL collective operations. + // NCCL collective operations provides fast collectives over multiple GPUs + // both within and across nodes. + // But, this collectives is used for collectives over multiple GPUs within + // nodes. + ncclComm_t nccl_comm_{nullptr}; +#endif + int compute_capability_; int runtime_version_; int driver_version_; @@ -297,6 +319,7 @@ class CUDADeviceContext : public DeviceContext { // StreamCallbackManager is thread-safe std::unique_ptr callback_manager_; + CudnnHolder* cudnn_holder() const; DISABLE_COPY_AND_ASSIGN(CUDADeviceContext); }; diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index 0179daa55715be9787bc7cc8a693319024d404b7..8458b17f82a976bad37df58dddc2c0d80c8eb13e 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/platform/device_tracer.h" #include #include @@ -30,6 +29,7 @@ limitations under the License. 
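CUDADeviceContext above no longer builds its CudnnHolder in the constructor; cudnn_holder() creates it on first use under std::call_once, which is why both the once_flag and the holder become mutable (the getter is const) and why the cudaSetDevice call moves into CudnnHolder's constructor. A reduced sketch of the idiom:

```cpp
#include <memory>
#include <mutex>

// Lazy, thread-safe, const-friendly member initialization: the handle is
// created at most once, on first access, even from const methods.
class Context {
 public:
  struct Handle {};  // stand-in for an expensive resource, e.g. a cuDNN handle

  Handle* handle() const {
    std::call_once(init_flag_, [this]() { handle_.reset(new Handle); });
    return handle_.get();
  }

 private:
  mutable std::once_flag init_flag_;
  mutable std::unique_ptr<Handle> handle_;
};
```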
*/ #include "glog/logging.h" #include "google/protobuf/text_format.h" #include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/platform/device_tracer.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/printf.h" @@ -222,19 +222,24 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, } case CUPTI_ACTIVITY_KIND_DRIVER: { auto *api = reinterpret_cast(record); - if (api->start != 0 && api->end != 0) - // -1 device id represents CUDA api call - tracer->AddCPURecords( + if (api->start != 0 && api->end != 0) { + // -1 device id represents ActiveKind api call + tracer->AddActiveKindRecords( DriverKind(api->cbid), api->start, api->end, -1, - GetThreadIdFromSystemThreadId(api->threadId)); + GetThreadIdFromSystemThreadId(api->threadId), + api->correlationId); + } break; } case CUPTI_ACTIVITY_KIND_RUNTIME: { auto *api = reinterpret_cast(record); - if (api->start != 0 && api->end != 0) - tracer->AddCPURecords( + if (api->start != 0 && api->end != 0) { + // -1 device id represents ActiveKind api call + tracer->AddActiveKindRecords( RuntimeKind(api->cbid), api->start, api->end, -1, - GetThreadIdFromSystemThreadId(api->threadId)); + GetThreadIdFromSystemThreadId(api->threadId), + api->correlationId); + } break; } default: { break; } @@ -313,6 +318,43 @@ class DeviceTracerImpl : public DeviceTracer { stream_id, correlation_id, bytes}); } + void AddMemInfoRecord(uint64_t start_ns, uint64_t end_ns, size_t bytes, + const Place &place, const std::string &alloc_in, + const std::string &free_in, int64_t thread_id) { + if (0 == start_ns || 0 == end_ns) { + VLOG(3) << alloc_in << ", " << free_in << " Cannot be traced."; + return; + } + thread_local std::forward_list *local_mem_info_record = + nullptr; + if (local_mem_info_record == nullptr) { + std::lock_guard l(trace_mu_); + mem_info_record_.emplace_front(); + local_mem_info_record = &mem_info_record_.front(); + } + local_mem_info_record->emplace_front(MemInfoRecord{ + start_ns, end_ns, bytes, place, thread_id, alloc_in, free_in}); + } + + void AddActiveKindRecords(const std::string &anno, uint64_t start_ns, + uint64_t end_ns, int64_t device_id, + int64_t thread_id, uint32_t correlation_id) { + if (anno.empty()) { + VLOG(1) << "Empty timeline annotation."; + return; + } + thread_local std::forward_list + *local_active_kind_records = nullptr; + if (local_active_kind_records == nullptr) { + std::lock_guard l(trace_mu_); + active_kind_records_.emplace_front(); + local_active_kind_records = &active_kind_records_.front(); + } + // lock is not needed, only one thread call this function. 
void AddKernelRecords(std::string name, uint64_t start, uint64_t end, int64_t device_id, int64_t stream_id, uint32_t correlation_id) { @@ -355,6 +397,7 @@ class DeviceTracerImpl : public DeviceTracer { } const std::vector<int> cbids { CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020, + CUPTI_RUNTIME_TRACE_CBID_cudaSetupArgument_v3020, CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020, CUPTI_RUNTIME_TRACE_CBID_cudaMemset_v3020, CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_v3020, @@ -385,6 +428,8 @@ class DeviceTracerImpl : public DeviceTracer { correlations_.clear(); for (auto &tmp : correlations_pairs) tmp.clear(); for (auto &tmp : cpu_records_) tmp.clear(); + for (auto &tmp : mem_info_record_) tmp.clear(); + for (auto &tmp : active_kind_records_) tmp.clear(); } void GenEventKernelCudaElapsedTime() { @@ -415,9 +460,12 @@ class DeviceTracerImpl : public DeviceTracer { proto::Profile profile_pb; profile_pb.set_start_ns(start_ns_); profile_pb.set_end_ns(end_ns_); - if (correlations_.empty()) - for (auto &tmp : correlations_pairs) + if (correlations_.empty()) { + for (auto &tmp : correlations_pairs) { for (auto &pair : tmp) correlations_[pair.first] = pair.second; + } + } + for (const KernelRecord &r : kernel_records_) { auto *event = profile_pb.add_events(); event->set_type(proto::Event::GPUKernel); @@ -437,7 +485,8 @@ class DeviceTracerImpl : public DeviceTracer { event->set_device_id(r.device_id); } VLOG(1) << "KernelRecord event miss: " << miss << " find: " << find; - for (auto &tmp : cpu_records_) + + for (auto &tmp : cpu_records_) { for (const CPURecord &r : tmp) { auto *event = profile_pb.add_events(); event->set_type(proto::Event::CPU); event->set_name(r.name); event->set_start_ns(r.start_ns); event->set_end_ns(r.end_ns); event->set_sub_device_id(r.thread_id); event->set_device_id(r.device_id); } @@ -447,6 +496,25 @@ + } + + for (auto &tmp : active_kind_records_) { + for (const ActiveKindRecord &r : tmp) { + auto *event = profile_pb.add_events(); + event->set_type(proto::Event::CPU); + auto c = correlations_.find(r.correlation_id); + if (c != correlations_.end() && c->second != nullptr) { + event->set_name(c->second->name()); + event->set_detail_info(r.name); + } else { + event->set_name(r.name); + } + event->set_start_ns(r.start_ns); + event->set_end_ns(r.end_ns); + event->set_sub_device_id(r.thread_id); + event->set_device_id(r.device_id); + } + } miss = find = 0; for (const MemRecord &r : mem_records_) { auto *event = profile_pb.add_events(); @@ -467,6 +535,31 @@ class DeviceTracerImpl : public DeviceTracer { event->mutable_memcopy()->set_bytes(r.bytes); } VLOG(1) << "MemRecord event miss: " << miss << " find: " << find; + + for (auto &tmp : mem_info_record_) { + for (const auto &r : tmp) { + auto *event = profile_pb.add_mem_events(); + event->set_device_id(0); + if (platform::is_cpu_place(r.place)) { + event->set_place(proto::MemEvent::CPUPlace); + } else if (platform::is_gpu_place(r.place)) { + event->set_place(proto::MemEvent::CUDAPlace); + event->set_device_id( + boost::get<platform::CUDAPlace>(r.place).GetDeviceId()); + } else if (platform::is_cuda_pinned_place(r.place)) { + event->set_place(proto::MemEvent::CUDAPinnedPlace); + } else { + PADDLE_THROW("The current place is not supported."); + } + event->set_alloc_in(r.alloc_in); + event->set_free_in(r.free_in); + event->set_start_ns(r.start_ns); + event->set_end_ns(r.end_ns); + event->set_bytes(r.bytes); + event->set_thread_id(r.thread_id); + } + } + std::ofstream profile_f; profile_f.open(profile_path, std::ios::out | std::ios::trunc | std::ios::binary);
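GenProfile ends by serializing the accumulated proto::Profile, including the new mem_events, to profile_path in binary form. For reference, a consumer could read the file back with the generated protobuf API along these lines (a sketch; it assumes the generated paddle::platform::proto namespace from profiler.proto, and the path is a placeholder):

#include <fstream>
#include "paddle/fluid/platform/profiler.pb.h"

paddle::platform::proto::Profile profile;
std::ifstream in("/tmp/profile", std::ios::in | std::ios::binary);
if (profile.ParseFromIstream(&in)) {
  for (const auto &ev : profile.events()) {
    // e.g. ev.name(), ev.start_ns(), ev.end_ns(), ev.device_id()
  }
  for (const auto &mev : profile.mem_events()) {
    // e.g. mev.bytes(), mev.place(), mev.alloc_in(), mev.free_in()
  }
}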
@@ -510,6 +603,8 @@ class DeviceTracerImpl : public DeviceTracer { std::forward_list<KernelRecord> kernel_records_; std::forward_list<MemRecord> mem_records_; std::forward_list<std::forward_list<CPURecord>> cpu_records_; + std::forward_list<std::forward_list<MemInfoRecord>> mem_info_record_; + std::forward_list<std::forward_list<ActiveKindRecord>> active_kind_records_; std::forward_list<std::list<std::pair<uint32_t, Event *>>> correlations_pairs; std::unordered_map<uint32_t, Event *> correlations_; @@ -531,7 +626,7 @@ Event *CurAnnotation() { return annotation_stack.back(); } std::string CurAnnotationName() { - if (annotation_stack.empty()) return ""; + if (annotation_stack.empty()) return "Unknown"; return annotation_stack.back()->name(); } @@ -613,6 +708,7 @@ void initCuptiCbidStr() { REGISTER_RUNTIME_CBID_STR(cudaUnbindTexture_v3020); REGISTER_RUNTIME_CBID_STR(cudaSetupArgument_v3020); REGISTER_RUNTIME_CBID_STR(cudaLaunch_v3020); + REGISTER_RUNTIME_CBID_STR(cudaDeviceGetPCIBusId_v4010); #if CUDA_VERSION >= 9000 REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernel_v9000); REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernelMultiDevice_v9000); diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h index d4418d836d66e329af8ed3f5ec05f49d47146b3e..85168a046fb3fa4317956737871cde56e15bedfb 100644 --- a/paddle/fluid/platform/device_tracer.h +++ b/paddle/fluid/platform/device_tracer.h @@ -18,6 +18,7 @@ limitations under the License. */ #include "paddle/fluid/platform/dynload/cupti.h" #include "paddle/fluid/platform/event.h" +#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/profiler.pb.h" @@ -47,6 +48,7 @@ class DeviceTracer { int64_t stream_id; uint32_t correlation_id; }; + struct CPURecord { std::string name; uint64_t start_ns; @@ -54,6 +56,7 @@ class DeviceTracer { int64_t device_id; int64_t thread_id; }; + struct MemRecord { std::string name; uint64_t start_ns; @@ -64,6 +67,25 @@ class DeviceTracer { uint64_t bytes; }; + struct MemInfoRecord { + uint64_t start_ns; + uint64_t end_ns; + size_t bytes; + Place place; + int64_t thread_id; + std::string alloc_in; + std::string free_in; + }; + + struct ActiveKindRecord { + std::string name; + uint64_t start_ns; + uint64_t end_ns; + int64_t device_id; + int64_t thread_id; + uint32_t correlation_id; + }; + virtual ~DeviceTracer() {} // Needs to be called once before use. virtual void Enable() = 0; @@ -85,6 +107,16 @@ class DeviceTracer { virtual void AddCPURecords(const std::string& anno, uint64_t start_ns, uint64_t end_ns, int64_t device_id, int64_t thread_id) = 0; + virtual void AddActiveKindRecords(const std::string& anno, uint64_t start_ns, + uint64_t end_ns, int64_t device_id, + int64_t thread_id, + uint32_t correlation_id) = 0; + + virtual void AddMemInfoRecord(uint64_t start_ns, uint64_t end_ns, + size_t bytes, const Place& place, + const std::string& alloc_in, + const std::string& free_in, + int64_t thread_id) = 0; // Adds cuda kernel stats. `correlation_id` will be mapped to the annotation // added before for human readability. diff --git a/paddle/fluid/platform/event.h b/paddle/fluid/platform/event.h index 2dcf966754cbed2670acb9c3548c23355be5503c..e9bdb82a50fa4166cecdaea1de01d2f458f3da9a 100644 --- a/paddle/fluid/platform/event.h +++ b/paddle/fluid/platform/event.h @@ -13,10 +13,12 @@ See the License for the specific language governing permissions and limitations under the License.
*/ #pragma once + #include <string> #ifdef PADDLE_WITH_CUDA #include <cuda_runtime.h> #endif +#include "paddle/fluid/platform/place.h" namespace paddle { namespace platform { @@ -64,5 +66,36 @@ class Event { #endif #endif }; + +class MemEvent { + public: + MemEvent(EventType type, uint64_t start_ns, uint64_t end_ns, size_t bytes, + Place place, int64_t thread_id, const std::string& annotation) + : type_(type), + start_ns_(start_ns), + end_ns_(end_ns), + bytes_(bytes), + place_(place), + thread_id_(thread_id), + annotation_(annotation) {} + + const EventType& type() const { return type_; } + uint64_t start_ns() const { return start_ns_; } + uint64_t end_ns() const { return end_ns_; } + size_t bytes() const { return bytes_; } + Place place() const { return place_; } + int64_t thread_id() const { return thread_id_; } + const std::string& annotation() const { return annotation_; } + + private: + EventType type_; + uint64_t start_ns_ = 0; + uint64_t end_ns_ = 0; + size_t bytes_; + Place place_; + int64_t thread_id_; + std::string annotation_; +}; + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index 400a6d7bfa5912774c4bbb2a5868dd9a471afd00..47cca879b4b71f58778cf3d1f24cab463ac73418 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/gpu_info.h" - #include #include #include @@ -31,6 +30,8 @@ constexpr static float fraction_of_gpu_memory_to_use = 0.92f; constexpr static float fraction_of_gpu_memory_to_use = 0.5f; #endif +constexpr static float fraction_reserve_gpu_memory = 0.05f; + DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use, "Allocate a trunk of gpu memory that is this fraction of the " "total gpu memory size. Future memory usage will be allocated " @@ -38,6 +39,24 @@ DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use, "additional trunks of the same size will be requested from gpu " "until the gpu has no memory left for another trunk."); +DEFINE_uint64( + initial_gpu_memory_in_mb, 0ul, + "Allocate a trunk of gpu memory whose byte size is specified by " + "the flag. Future memory usage will be allocated from the " + "trunk. If the trunk doesn't have enough gpu memory, additional " + "trunks of the gpu memory will be requested from gpu with size " + "specified by FLAGS_reallocate_gpu_memory_in_mb until the gpu has " + "no memory left for the additional trunk. Note: if you set this " + "flag, the memory size set by " + "FLAGS_fraction_of_gpu_memory_to_use will be overridden by this " + "flag. If you don't set this flag, PaddlePaddle will use " + "FLAGS_fraction_of_gpu_memory_to_use to allocate gpu memory."); + +DEFINE_uint64(reallocate_gpu_memory_in_mb, 0ul, + "If this flag is set, Paddle will reallocate the gpu memory with " + "size specified by this flag. Otherwise, Paddle will reallocate " + "using FLAGS_fraction_of_gpu_memory_to_use"); +
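Taken together, these flags give explicit sizes precedence over the fraction policy: a nonzero initial_gpu_memory_in_mb (or reallocate_gpu_memory_in_mb) fixes the chunk size in bytes, and otherwise the fraction of (total - 5% reserve) applies, exactly as GpuInitAllocSize/GpuReallocSize compute below. A hedged, stand-alone sketch of that precedence (the names are local stand-ins for the gflags):

#include <cstddef>
#include <cstdint>

size_t ChunkBytes(uint64_t explicit_mb, double fraction, size_t total_bytes) {
  const double kReserve = 0.05;  // mirrors fraction_reserve_gpu_memory
  if (explicit_mb > 0) {
    return static_cast<size_t>(explicit_mb << 20);  // MB -> bytes wins outright
  }
  size_t reserving = static_cast<size_t>(kReserve * total_bytes);
  return static_cast<size_t>((total_bytes - reserving) * fraction);
}

// e.g. ChunkBytes(0, 0.5, 8ull << 30) keeps ~5% back and takes half the rest,
// while ChunkBytes(512, 0.5, 8ull << 30) is exactly 512 MB.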
DEFINE_bool( enable_cublas_tensor_op_math, false, "The enable_cublas_tensor_op_math indicates whether to use Tensor Core, " @@ -180,13 +199,43 @@ void GpuMemoryUsage(size_t *available, size_t *total) { } size_t GpuMaxAllocSize() { + return std::max(GpuInitAllocSize(), GpuReallocSize()); +} + +size_t GpuInitAllocSize() { + if (FLAGS_initial_gpu_memory_in_mb > 0ul) { + // Initial memory will be allocated by FLAGS_initial_gpu_memory_in_mb + return static_cast<size_t>(FLAGS_initial_gpu_memory_in_mb << 20); + } + + // FLAGS_initial_gpu_memory_in_mb is 0, so initial memory will be allocated + // by fraction size_t total = 0; size_t available = 0; GpuMemoryUsage(&available, &total); + size_t reserving = static_cast<size_t>(fraction_reserve_gpu_memory * total); - // Reserve the rest for page tables, etc. - return static_cast<size_t>(total * FLAGS_fraction_of_gpu_memory_to_use); + return static_cast<size_t>((total - reserving) * + FLAGS_fraction_of_gpu_memory_to_use); +} + +size_t GpuReallocSize() { + if (FLAGS_reallocate_gpu_memory_in_mb > 0ul) { + // Additional memory will be allocated by FLAGS_reallocate_gpu_memory_in_mb + return static_cast<size_t>(FLAGS_reallocate_gpu_memory_in_mb << 20); + } + + // FLAGS_reallocate_gpu_memory_in_mb is 0, so additional memory will be + // allocated by fraction + size_t total = 0; + size_t available = 0; + + GpuMemoryUsage(&available, &total); + size_t reserving = static_cast<size_t>(fraction_reserve_gpu_memory * total); + + return static_cast<size_t>((total - reserving) * + FLAGS_fraction_of_gpu_memory_to_use); } size_t GpuMinChunkSize() { @@ -201,16 +250,13 @@ size_t GpuMaxChunkSize() { GpuMemoryUsage(&available, &total); VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/" << total / 1024 / 1024 << "M"; - size_t reserving = static_cast<size_t>(0.05 * total); + size_t reserving = static_cast<size_t>(fraction_reserve_gpu_memory * total); // If available is less than the minimum chunk size, no usable memory exists. available = std::min(std::max(available, GpuMinChunkSize()) - GpuMinChunkSize(), total - reserving); - // Reserving the rest memory for page tables, etc. - - size_t allocating = static_cast<size_t>(FLAGS_fraction_of_gpu_memory_to_use * - (total - reserving)); + size_t allocating = GpuMaxAllocSize(); PADDLE_ENFORCE_LE(allocating, available, "Insufficient GPU memory for allocation."); diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h index 1e1ab2503f53fe20bbe62c48f65d8535947f1aa8..d4be7ac97b2df6fe578582ae296e1dfc5548260c 100644 --- a/paddle/fluid/platform/gpu_info.h +++ b/paddle/fluid/platform/gpu_info.h @@ -60,6 +60,12 @@ void GpuMemoryUsage(size_t *available, size_t *total); //! Get the maximum allocation size of current GPU device. size_t GpuMaxAllocSize(); +//! Get the initial allocation size of current GPU device. +size_t GpuInitAllocSize(); + +//! Get the re-allocation size of current GPU device. +size_t GpuReallocSize(); + //! Get the minimum chunk size for GPU buddy allocator. size_t GpuMinChunkSize(); diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 4dcf7e79043af008cb2067d90d12d629c5c2d0d9..407d1b1299855712d9877e59ed192c000b001036 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include <string.h> // for strdup #include +#include +#include #include #include @@ -29,6 +31,10 @@ limitations under the License.
*/ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/piece.h" +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#include "dgc/dgc.h" +#endif + DEFINE_int32(paddle_num_threads, 1, "Number of threads for each paddle instance."); DEFINE_int32(multiple_of_cupti_buffer_size, 1, @@ -41,6 +47,10 @@ namespace framework { std::once_flag gflags_init_flag; std::once_flag p2p_init_flag; +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +std::once_flag dgc_init_flag; +#endif + void InitGflags(std::vector argv) { std::call_once(gflags_init_flag, [&]() { FLAGS_logtostderr = true; @@ -140,6 +150,7 @@ void InitDevices(bool init_p2p, const std::vector devices) { places.emplace_back(platform::CPUPlace()); platform::DeviceContextPool::Init(places); platform::DeviceTemporaryAllocator::Init(); + #ifndef PADDLE_WITH_MKLDNN platform::SetNumThreads(FLAGS_paddle_num_threads); #endif @@ -200,5 +211,15 @@ void InitGLOG(const std::string &prog_name) { #endif } +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +void InitDGC() { + std::call_once(dgc_init_flag, []() { + PADDLE_ENFORCE(paddle::communication::dgc::dynloadNcclLib()); + }); +} +#else +void InitDGC() {} +#endif + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/platform/init.h b/paddle/fluid/platform/init.h index 0e30594672927253cc8083dcb88bb867d63ec729..01d66f57dc96c30b474e8a794e375677594ff5f5 100644 --- a/paddle/fluid/platform/init.h +++ b/paddle/fluid/platform/init.h @@ -30,5 +30,7 @@ void InitDevices(bool init_p2p); void InitDevices(bool init_p2p, const std::vector devices); +void InitDGC(); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index 6ae21ee8294bedc388f837aad3e20a2b9aca98a2..b8b14b3d15efb47cbf53a393476f25158ebb5dff 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -16,10 +16,13 @@ #pragma once #include +#include #include #include // NOLINT #include +#include #include + #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/platform/dynload/nccl.h" #include "paddle/fluid/platform/enforce.h" @@ -77,6 +80,7 @@ struct NCCLContext { : ctx_(new CUDADeviceContext(CUDAPlace(dev_id))), comm_{nullptr} {} cudaStream_t stream() const { return ctx_->stream(); } + ncclComm_t comm() const { return comm_; } int device_id() const { return boost::get(ctx_->GetPlace()).device; @@ -101,9 +105,6 @@ struct NCCLContextMap { order_.size(), contexts_.size(), "NCCL Context Map does not support contain two or more same device"); - if (places.size() <= 1 && num_trainers == 1) { - return; - } std::unique_ptr comms(new ncclComm_t[order_.size()]); // if num_trainers == 1, should create a new nccl id for local comms. 
if (num_trainers == 1 && nccl_id == nullptr) { @@ -123,8 +124,8 @@ struct NCCLContextMap { } else { rank = trainer_id; } - VLOG(30) << "init nccl rank: " << rank << " nranks: " << nranks - << "gpu id: " << gpu_id; + VLOG(3) << "init nccl rank: " << rank << " nranks: " << nranks + << " gpu id: " << gpu_id; PADDLE_ENFORCE(cudaSetDevice(gpu_id)); PADDLE_ENFORCE(platform::dynload::ncclCommInitRank( comms.get() + i, nranks, *nccl_id, rank)); diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 9a285a6b533dcb48013e3b3e4d34dc27186173ac..6d055a442106d88af39c771f3ddf156ba616c99f 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/profiler.h" - #include #include #include @@ -21,6 +20,8 @@ limitations under the License. */ #include // NOLINT #include #include +#include + #ifdef PADDLE_WITH_CUDA #include #endif // PADDLE_WITH_CUDA @@ -36,8 +37,6 @@ DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not."); namespace paddle { namespace platform { -struct EventList; - static int64_t profiler_lister_id = 0; static bool should_send_profile_state = false; std::mutex profiler_mu; @@ -53,43 +52,15 @@ static uint32_t g_next_thread_id = 0; // The global mutex static std::mutex g_all_event_lists_mutex; // The total event lists of all threads -static std::list> g_all_event_lists; +static std::list>> g_all_event_lists; // The thread local event list only can be accessed by the specific thread -static thread_local std::shared_ptr g_event_list; - -struct EventList { - constexpr static size_t kMB = 1024 * 1024; - constexpr static size_t kEventBlockSize = 16 * kMB; - constexpr static size_t kEventSize = sizeof(Event); - constexpr static size_t kEventAlign = alignof(Event); - constexpr static size_t kNumBlock = - kEventBlockSize / - ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign); - - template - Event* Record(Args&&... 
args) { - if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) { - event_blocks.emplace_front(); - event_blocks.front().reserve(kNumBlock); - } - event_blocks.front().emplace_back(std::forward<Args>(args)...); - return &event_blocks.front().back(); - } - - std::vector<Event> Reduce() { - std::vector<Event> result; - for (auto& block : event_blocks) { - result.insert(result.begin(), std::make_move_iterator(block.begin()), - std::make_move_iterator(block.end())); - } - event_blocks.clear(); - return result; - } +static thread_local std::shared_ptr<EventList<Event>> g_event_list; - void Clear() { event_blocks.clear(); } - - std::forward_list<std::vector<Event>> event_blocks; -}; +static std::list<std::shared_ptr<EventList<MemEvent>>> g_all_mem_event_lists; +static thread_local std::shared_ptr<EventList<MemEvent>> g_mem_event_list; +static std::mutex g_all_mem_event_lists_mutex; +static thread_local int32_t g_mem_thread_id; +static uint32_t g_mem_next_thread_id = 0; inline uint64_t GetTimeInNsec() { using clock = std::conditional &GetMemEventList() { + if (!g_mem_event_list) { + g_mem_event_list = std::make_shared<EventList<MemEvent>>(); + std::lock_guard<std::mutex> guard(g_all_mem_event_lists_mutex); + g_mem_thread_id = g_mem_next_thread_id++; + g_all_mem_event_lists.emplace_front(g_mem_event_list); + } + return *g_mem_event_list; +} + +void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, + const Place &place, const std::string &annotation) { + GetMemEventList().Record(EventType::kPushRange, start_ns, end_ns, bytes, + place, g_mem_thread_id, annotation); +} + +void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, + const Place &place, const std::string &annotation) { + GetMemEventList().Record(EventType::kPopRange, start_ns, end_ns, bytes, place, + g_mem_thread_id, annotation); +} + +inline EventList<Event> &GetEventList() { if (!g_event_list) { std::lock_guard<std::mutex> guard(g_all_event_lists_mutex); - g_event_list = std::make_shared<EventList>(); + g_event_list = std::make_shared<EventList<Event>>(); g_thread_id = g_next_thread_id++; g_all_event_lists.emplace_front(g_event_list); RecoreCurThreadId(g_thread_id); @@ -131,26 +124,26 @@ inline EventList& GetEventList() { return *g_event_list; } -void Mark(const std::string& name) { +void Mark(const std::string &name) { GetEventList().Record(EventType::kMark, name, g_thread_id); } -Event* PushEvent(const std::string& name) { +Event *PushEvent(const std::string &name) { return GetEventList().Record(EventType::kPushRange, name, g_thread_id); } -void PopEvent(const std::string& name) { +void PopEvent(const std::string &name) { GetEventList().Record(EventType::kPopRange, name, g_thread_id); } -RecordEvent::RecordEvent(const std::string& name) +RecordEvent::RecordEvent(const std::string &name) : is_enabled_(false), start_ns_(PosixInNsec()) { if (g_state == ProfilerState::kDisabled) return; // lock is not needed, the code below is thread-safe is_enabled_ = true; name_ = name; - Event* e = PushEvent(name_); + Event *e = PushEvent(name_); // May need the same push/pop behavior. SetCurAnnotation(e); }
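With EventList now a template (its definition moves into profiler.h further down in this diff), the same block-allocated storage backs both the existing Event stream and the new MemEvent stream. A rough usage sketch under that assumption, with the Record arguments forwarded to the MemEvent constructor from event.h above:

// Sketch: assumes the EventList<T> template and MemEvent from this diff.
EventList<MemEvent> list;
list.Record(EventType::kPushRange, /*start_ns=*/0, /*end_ns=*/42,
            /*bytes=*/1024, CPUPlace(), /*thread_id=*/0, "fc_op_alloc");
std::vector<MemEvent> drained = list.Reduce();  // moves events out, clears blocks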
@@ -158,7 +151,7 @@ RecordEvent::RecordEvent(const std::string& name) RecordEvent::~RecordEvent() { if (g_state == ProfilerState::kDisabled || !is_enabled_) return; // lock is not needed, the code below is thread-safe - DeviceTracer* tracer = GetDeviceTracer(); + DeviceTracer *tracer = GetDeviceTracer(); if (tracer) { tracer->AddCPURecords(CurAnnotationName(), start_ns_, PosixInNsec(), BlockDepth(), g_thread_id); @@ -167,7 +160,56 @@ RecordEvent::~RecordEvent() { PopEvent(name_); } -RecordRPCEvent::RecordRPCEvent(const std::string& name) { +MemEvenRecorder MemEvenRecorder::recorder; + +void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place, + size_t size) { + if (g_state == ProfilerState::kDisabled) return; + std::lock_guard<std::mutex> guard(mtx_); + auto &events = address_memevent_[place]; + PADDLE_ENFORCE(events.count(ptr) == 0, ""); + events.emplace(ptr, std::unique_ptr<RecordMemEvent>( + new MemEvenRecorder::RecordMemEvent(place, size))); +} + +void MemEvenRecorder::PopMemRecord(const void *ptr, const Place &place) { + if (g_state == ProfilerState::kDisabled) return; + std::lock_guard<std::mutex> guard(mtx_); + auto &events = address_memevent_[place]; + auto iter = events.find(ptr); + // The ptr may not be present in address_memevent_. + if (iter != events.end()) { + events.erase(iter); + } +} + +void MemEvenRecorder::Flush() { + std::lock_guard<std::mutex> guard(mtx_); + address_memevent_.clear(); +} + +MemEvenRecorder::RecordMemEvent::RecordMemEvent(const Place &place, + size_t bytes) + : place_(place), + bytes_(bytes), + start_ns_(PosixInNsec()), + alloc_in_(CurAnnotationName()) { + PushMemEvent(start_ns_, end_ns_, bytes_, place_, alloc_in_); +} + +MemEvenRecorder::RecordMemEvent::~RecordMemEvent() { + DeviceTracer *tracer = GetDeviceTracer(); + end_ns_ = PosixInNsec(); + + auto annotation_free = CurAnnotationName(); + if (tracer) { + tracer->AddMemInfoRecord(start_ns_, end_ns_, bytes_, place_, alloc_in_, + annotation_free, g_mem_thread_id); + } + PopMemEvent(start_ns_, end_ns_, bytes_, place_, annotation_free); +} + +RecordRPCEvent::RecordRPCEvent(const std::string &name) { if (FLAGS_enable_rpc_profiler) { event_.reset(new platform::RecordEvent(name)); } @@ -185,7 +227,7 @@ RecordBlock::RecordBlock(int block_id) RecordBlock::~RecordBlock() { // lock is not needed, the code below is thread-safe if (g_state == ProfilerState::kDisabled || !is_enabled_) return; - DeviceTracer* tracer = GetDeviceTracer(); + DeviceTracer *tracer = GetDeviceTracer(); if (tracer) { // We try to put all blocks at the same nested depth in the // same timeline lane, and distinguish them by thread_id.
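RecordMemEvent is an RAII pair: construction stamps the allocation time and annotation, destruction stamps the free time and reports both to the tracer. An allocator hook therefore only needs to create and destroy these objects keyed by pointer; a hypothetical wrapper (RawAlloc/RawFree are placeholder primitives, not Paddle's allocator) might look like:

void *TracedAlloc(size_t bytes, const platform::Place &place) {
  void *p = RawAlloc(bytes);  // placeholder low-level allocation
  platform::MemEvenRecorder::Instance().PushMemRecord(p, place, bytes);
  return p;
}

void TracedFree(void *p, const platform::Place &place) {
  // Destroys the matching RecordMemEvent, which emits the paired pop event.
  platform::MemEvenRecorder::Instance().PopMemRecord(p, place);
  RawFree(p);  // placeholder low-level free
}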
@@ -232,11 +274,16 @@ void EnableProfiler(ProfilerState state) { void ResetProfiler() { SynchronizeAllDevice(); GetDeviceTracer()->Reset(); + MemEvenRecorder::Instance().Flush(); std::lock_guard guard(g_all_event_lists_mutex); for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end(); ++it) { (*it)->Clear(); } + for (auto it = g_all_mem_event_lists.begin(); + it != g_all_mem_event_lists.end(); ++it) { + (*it)->Clear(); + } } std::vector> GetAllEvents() { @@ -249,6 +296,15 @@ std::vector> GetAllEvents() { return result; } +std::vector> GetMemEvents() { + std::lock_guard guard(g_all_mem_event_lists_mutex); + std::vector> result; + for (auto &it : g_all_mem_event_lists) { + result.emplace_back((*it).Reduce()); + } + return result; +} + // The information of each event given in the profiling report struct EventItem { std::string name; @@ -263,8 +319,8 @@ struct EventItem { }; // Print results -void PrintProfiler(const std::vector>& events_table, - const std::string& sorted_domain, const size_t name_width, +void PrintProfiler(const std::vector> &events_table, + const std::string &sorted_domain, const size_t name_width, const size_t data_width, bool merge_thread) { // Output header information std::cout << "\n------------------------->" @@ -302,7 +358,7 @@ void PrintProfiler(const std::vector>& events_table, << std::setw(data_width) << "Ratio." << std::endl; for (size_t i = 0; i < events_table.size(); ++i) { for (size_t j = 0; j < events_table[i].size(); ++j) { - const EventItem& event_item = events_table[i][j]; + const EventItem &event_item = events_table[i][j]; std::cout << std::setw(name_width) << event_item.name << std::setw(data_width) << event_item.calls << std::setw(data_width) << event_item.total_time; @@ -326,54 +382,54 @@ void PrintProfiler(const std::vector>& events_table, } // Parse the event list and output the profiling report -void ParseEvents(const std::vector>& events, +void ParseEvents(const std::vector> &events, bool merge_thread, EventSortingKey sorted_by = EventSortingKey::kDefault) { if (g_state == ProfilerState::kDisabled) return; if (merge_thread && events.size() < 2) return; std::string sorted_domain; - std::function sorted_func; + std::function sorted_func; switch (sorted_by) { case EventSortingKey::kCalls: sorted_domain = "number of calls"; - sorted_func = [](const EventItem& a, const EventItem& b) { + sorted_func = [](const EventItem &a, const EventItem &b) { return a.calls > b.calls; }; break; case EventSortingKey::kTotal: sorted_domain = "total time"; - sorted_func = [](const EventItem& a, const EventItem& b) { + sorted_func = [](const EventItem &a, const EventItem &b) { return a.total_time > b.total_time; }; break; case EventSortingKey::kMin: sorted_domain = "minimum time"; - sorted_func = [](const EventItem& a, const EventItem& b) { + sorted_func = [](const EventItem &a, const EventItem &b) { return a.min_time > b.min_time; }; break; case EventSortingKey::kMax: sorted_domain = "maximum time"; - sorted_func = [](const EventItem& a, const EventItem& b) { + sorted_func = [](const EventItem &a, const EventItem &b) { return a.max_time > b.max_time; }; break; case EventSortingKey::kAve: sorted_domain = "average time"; - sorted_func = [](const EventItem& a, const EventItem& b) { + sorted_func = [](const EventItem &a, const EventItem &b) { return a.ave_time > b.ave_time; }; break; case EventSortingKey::kGPUTime: sorted_domain = "average time"; - sorted_func = [](const EventItem& a, const EventItem& b) { + sorted_func = [](const EventItem &a, const EventItem 
&b) { return a.gpu_time > b.gpu_time; }; break; case EventSortingKey::kCPUTime: sorted_domain = "average time"; - sorted_func = [](const EventItem& a, const EventItem& b) { + sorted_func = [](const EventItem &a, const EventItem &b) { return a.cpu_time > b.cpu_time; }; break; @@ -381,7 +437,7 @@ void ParseEvents(const std::vector>& events, sorted_domain = "event first end time"; } - const std::vector>* analyze_events; + const std::vector> *analyze_events; std::vector> merged_events_list; if (merge_thread) { std::vector merged_events; @@ -469,7 +525,7 @@ void ParseEvents(const std::vector>& events, } } // average time - for (auto& item : event_items) { + for (auto &item : event_items) { item.ave_time = item.total_time / item.calls; item.ratio = item.total_time / total; } @@ -493,15 +549,77 @@ void ParseEvents(const std::vector>& events, merge_thread); } +struct MemoryProfierReport { + size_t alloc_times{0}; + size_t alloc_size{0}; + size_t free_times{0}; + size_t free_size{0}; +}; + +// Print results +void PrintMemProfiler( + const std::map> + &annotation_report, + const size_t name_width, const size_t data_width) { + // Output header information + std::cout << "\n------------------------->" + << " Memory Profiling Report " + << "<-------------------------\n\n"; + + // Output events table + std::cout.setf(std::ios::left); + std::cout << std::setw(name_width) << "Event" << std::setw(data_width) + << "Alloc Calls" << std::setw(data_width) << "Size(MB)" + << std::setw(data_width) << "Free Calls" << std::setw(data_width) + << "Size(MB)" << std::endl; + + for (auto &tmp : annotation_report) { + for (auto &e : tmp.second) { + auto event_name = string::Sprintf("%s:%s", tmp.first, e.first); + std::cout << std::setw(name_width) << event_name; + std::cout << std::setw(data_width) << e.second.alloc_times; + std::cout << std::setw(data_width) + << e.second.alloc_size / (1024.0 * 1024.0); + std::cout << std::setw(data_width) << e.second.free_times; + std::cout << std::setw(data_width) + << e.second.free_size / (1024.0 * 1024.0) << std::endl; + } + } + std::cout << std::endl; +} + +// parse memory events +void ParseMemEvents(const std::vector> &events) { + if (g_state == ProfilerState::kDisabled) return; + // place, annotation, alloc times, alloc size + std::map> + annotation_report; + + for (auto &tmp : events) { + for (auto &e : tmp) { + if (e.type() == EventType::kPushRange) { + annotation_report[e.place()][e.annotation()].alloc_times += 1; + annotation_report[e.place()][e.annotation()].alloc_size += e.bytes(); + } else if (e.type() == EventType::kPopRange) { + annotation_report[e.place()][e.annotation()].free_times += 1; + annotation_report[e.place()][e.annotation()].free_size += e.bytes(); + } + } + } + PrintMemProfiler(annotation_report, 55, 18); +} + void DisableProfiler(EventSortingKey sorted_key, - const std::string& profile_path) { + const std::string &profile_path) { SynchronizeAllDevice(); + MemEvenRecorder::Instance().Flush(); + std::lock_guard l(profiler_mu); if (g_state == ProfilerState::kDisabled) return; // Mark the profiling stop. 
Mark("_stop_profiler_"); - DeviceTracer* tracer = GetDeviceTracer(); + DeviceTracer *tracer = GetDeviceTracer(); if (tracer->IsEnabled()) { tracer->Disable(); tracer->GenProfile(profile_path); @@ -511,6 +629,11 @@ void DisableProfiler(EventSortingKey sorted_key, std::vector> all_events = GetAllEvents(); ParseEvents(all_events, true, sorted_key); ParseEvents(all_events, false, sorted_key); + if (VLOG_IS_ON(5)) { + std::vector> all_mem_events = GetMemEvents(); + ParseMemEvents(all_mem_events); + } + ResetProfiler(); g_state = ProfilerState::kDisabled; should_send_profile_state = true; diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index aec0ae34292d62905de0e1f459b2b6db4554ebb7..8d11855b70de824159f19f2997b876564e7719b1 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -15,10 +15,17 @@ limitations under the License. */ #pragma once #include #include +#include +#include +#include // NOLINT #include +#include +#include +#include #include #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/event.h" +#include "paddle/fluid/platform/place.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/gpu_info.h" #endif @@ -34,8 +41,41 @@ enum ProfilerState { void Mark(const std::string& name); -Event* PushEvent(const std::string& name); +void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, + const Place& place); +void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, + const Place& place); + +struct MemEvenRecorder { + public: + void PushMemRecord(const void* ptr, const Place& place, size_t size); + void PopMemRecord(const void* ptr, const Place& place); + void Flush(); + static MemEvenRecorder& Instance() { return recorder; } + private: + struct RecordMemEvent { + RecordMemEvent(const Place& place, size_t bytes); + ~RecordMemEvent(); + + Place place_; + size_t bytes_; + uint64_t start_ns_; + uint64_t end_ns_; + std::string alloc_in_; + std::string free_in_; + }; + + static MemEvenRecorder recorder; + std::map>> + address_memevent_; + std::mutex mtx_; + MemEvenRecorder() {} + DISABLE_COPY_AND_ASSIGN(MemEvenRecorder); +}; + +Event* PushEvent(const std::string& name); void PopEvent(const std::string& name); struct RecordEvent { @@ -87,6 +127,41 @@ enum EventSortingKey { kGPUTime }; +template +struct EventList { + constexpr static size_t kMB = 1024 * 1024; + constexpr static size_t kEventBlockSize = 16 * kMB; + constexpr static size_t kEventSize = sizeof(T); + constexpr static size_t kEventAlign = alignof(T); + constexpr static size_t kNumBlock = + kEventBlockSize / + ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign); + + template + T* Record(Args&&... args) { + if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) { + event_blocks.emplace_front(); + event_blocks.front().reserve(kNumBlock); + } + event_blocks.front().emplace_back(std::forward(args)...); + return &event_blocks.front().back(); + } + + std::vector Reduce() { + std::vector result; + for (auto& block : event_blocks) { + result.insert(result.begin(), std::make_move_iterator(block.begin()), + std::make_move_iterator(block.end())); + } + event_blocks.clear(); + return result; + } + + void Clear() { event_blocks.clear(); } + + std::forward_list> event_blocks; +}; + // Enable the profiling function. 
void EnableProfiler(ProfilerState state); diff --git a/paddle/fluid/platform/profiler.proto b/paddle/fluid/platform/profiler.proto index e761d7b266e92fd5d47b5b6073ffc8bea1dc877d..cfa3c6906f83f750c8d6dc654f29b8fe95ec17ac 100644 --- a/paddle/fluid/platform/profiler.proto +++ b/paddle/fluid/platform/profiler.proto @@ -34,8 +34,25 @@ message Event { optional string detail_info = 9; } +message MemEvent { + enum Place { + CUDAPlace = 0; + CPUPlace = 1; + CUDAPinnedPlace = 2; + } + optional uint64 start_ns = 1; + optional uint64 end_ns = 2; + optional uint64 bytes = 3; + optional Place place = 4; + optional uint64 thread_id = 5; + optional uint32 device_id = 6; + optional string alloc_in = 7; + optional string free_in = 8; +} + message Profile { repeated Event events = 1; optional uint64 start_ns = 2; optional uint64 end_ns = 3; + repeated MemEvent mem_events = 4; } \ No newline at end of file diff --git a/paddle/fluid/platform/temporary_allocator.cc b/paddle/fluid/platform/temporary_allocator.cc index 9cbdfe46e78dc84e58eae6929c887221d9562c69..ddde7baf4cf3b44ac5d8a22fcc98acef50294575 100644 --- a/paddle/fluid/platform/temporary_allocator.cc +++ b/paddle/fluid/platform/temporary_allocator.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/platform/temporary_allocator.h" +#include <utility> #include "paddle/fluid/memory/allocation/allocator_facade.h" DEFINE_int64(limit_of_tmp_allocation, -1, @@ -29,38 +30,31 @@ namespace paddle { namespace platform { namespace alloc = memory::allocation; -TemporaryAllocation::TemporaryAllocation( - alloc::AllocationPtr &&underlying_allocation) - : Allocation(underlying_allocation->ptr(), underlying_allocation->size(), - underlying_allocation->place()), - underlying_allocation_(std::move(underlying_allocation)) {} - TemporaryAllocator::TemporaryAllocator(platform::Place place) : place_(place) { - temp_mem_map_.reset(new std::multimap<size_t, TemporaryAllocation *>()); + temp_mem_map_.reset(new std::multimap<size_t, alloc::Allocation *>()); } bool TemporaryAllocator::IsAllocThreadSafe() const { return true; } void TemporaryAllocator::Release(const std::function<void()> &callback) { - std::unique_ptr<std::multimap<size_t, alloc::Allocation *>> t_allocations; { std::unique_lock<std::mutex> lock(mtx_); callback(); t_allocations.swap(temp_mem_map_); - temp_mem_map_.reset(new std::multimap<size_t, TemporaryAllocation *>()); + temp_mem_map_.reset(new std::multimap<size_t, alloc::Allocation *>()); wait_delete_mem_ = 0; } + alloc::AllocationDeleter deleter; for (auto tmp : *t_allocations) { VLOG(10) << "Delete temporary allocation " << tmp.second->ptr() << " size: " << tmp.second->size(); - delete tmp.second; + deleter(tmp.second); } } -void TemporaryAllocator::Free(alloc::Allocation *allocation) { - auto *temp_allocation = dynamic_cast<TemporaryAllocation *>(allocation); - PADDLE_ENFORCE_NOT_NULL(temp_allocation); +void TemporaryAllocator::FreeImpl(alloc::Allocation *temp_allocation) { if (platform::is_gpu_place(temp_allocation->place())) { PADDLE_ENFORCE(platform::is_same_place(temp_allocation->place(), place_), "The place should be the same."); @@ -84,7 +78,7 @@ void TemporaryAllocator::Free(alloc::Allocation *allocation) { } VLOG(10) << "Delete temporary allocation " << temp_allocation->ptr() << " size: " << temp_allocation->size(); - delete temp_allocation; + alloc::AllocationDeleter()(temp_allocation); } size_t TemporaryAllocator::TemporaryAllocationQueueSize() { @@ -119,11 +113,9 @@ alloc::Allocation *TemporaryAllocator::AllocateImpl( } // If no available allocation is found, get one from the // AllocatorFacade instance. - auto raw_allocation = - alloc::AllocatorFacade::Instance().Alloc(place_, size, attr); - auto temp_mem = new TemporaryAllocation(std::move(raw_allocation)); + auto temp_mem = alloc::AllocatorFacade::Instance().Alloc(place_, size, attr); VLOG(10) << "Alloc temporary allocation: " << temp_mem->ptr() << ": " << size; - return temp_mem; + return temp_mem.release(); } } // namespace platform
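After this change, the free path parks allocations in a size-keyed multimap instead of releasing them, and the allocation path can serve later requests from that pool. Stripped of Paddle specifics, the reuse logic is roughly the following self-contained sketch (Block and ReusePool are illustrative names):

#include <map>
#include <memory>
#include <mutex>
#include <utility>

struct Block { size_t size; /* payload elided */ };

class ReusePool {
 public:
  void Park(std::unique_ptr<Block> b) {
    std::lock_guard<std::mutex> g(mu_);
    size_t s = b->size;
    pool_.emplace(s, std::move(b));  // keep freed block for reuse
  }
  std::unique_ptr<Block> Take(size_t size) {
    std::lock_guard<std::mutex> g(mu_);
    auto it = pool_.lower_bound(size);  // smallest parked block that fits
    if (it == pool_.end()) return nullptr;
    std::unique_ptr<Block> b = std::move(it->second);
    pool_.erase(it);
    return b;
  }
 private:
  std::mutex mu_;
  std::multimap<size_t, std::unique_ptr<Block>> pool_;
};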
diff --git a/paddle/fluid/platform/temporary_allocator.h b/paddle/fluid/platform/temporary_allocator.h index d657a14223326aa1e2cb5b154a10a56ae742f95c..912d45eaf17fe8c05840995275dd3e2e688b38ef 100644 --- a/paddle/fluid/platform/temporary_allocator.h +++ b/paddle/fluid/platform/temporary_allocator.h @@ -16,20 +16,13 @@ #include // NOLINT #include #include +#include #include // NOLINT #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/platform/lock_guard_ptr.h" namespace paddle { namespace platform { -class TemporaryAllocation : public memory::allocation::Allocation { - public: - explicit TemporaryAllocation( - memory::allocation::AllocationPtr &&underlying_allocation); - - memory::allocation::AllocationPtr underlying_allocation_; -}; - /*! \brief the TemporaryAllocator is used to alloc temporary allocations * which are used by CUDA's async operations. * @@ -56,7 +49,7 @@ class TemporaryAllocator : public memory::allocation::Allocator { void SetCallback(const std::function<void()> &callback); protected: - void Free(memory::allocation::Allocation *allocation) override; + void FreeImpl(memory::allocation::Allocation *allocation) override; memory::allocation::Allocation *AllocateImpl( size_t size, memory::allocation::Allocator::Attr attr) override; @@ -65,8 +58,8 @@ class TemporaryAllocator : public memory::allocation::Allocator { platform::Place place_; // When the allocation is not held by any variable, it should be placed // into temp_mem_map_ immediately. - std::unique_ptr<std::multimap<size_t, TemporaryAllocation *>> temp_mem_map_{ - nullptr}; + std::unique_ptr<std::multimap<size_t, memory::allocation::Allocation *>> + temp_mem_map_{nullptr}; std::mutex mtx_; size_t wait_delete_mem_{0}; std::function<void()> callback_; diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 4ac5b83c56b114f4e3e4c78710716adc636ebe1d..0991eff0fdaaca80ada2d8dd3c68eba72fd3f6e6 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,11 +1,11 @@ set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler layer scope_pool - tracer analysis_predictor) + tracer analysis_predictor imperative_profiler) if(WITH_PYTHON) list(APPEND PYBIND_DEPS py_func_op) endif() -set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc ir.cc inference_api.cc) +set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc reader_py.cc async_executor_py.cc imperative.cc ir.cc inference_api.cc) if(WITH_PYTHON) if(WITH_AMD_GPU) diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index aeabed19abfda3c857f54e5ada54d52bf95e2602..e9ed4e16443eba481143bd2095f9970bcb167d71 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -13,10 +13,18 @@ See the License for the specific language governing permissions and limitations under the License.
*/ #include "paddle/fluid/pybind/imperative.h" + +#include +#include +#include +#include + #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/imperative/type_defs.h" +#include "paddle/fluid/pybind/pybind_boost_headers.h" + namespace paddle { namespace pybind { @@ -30,21 +38,23 @@ void BindTracer(pybind11::module* m) { .def("trace", [](imperative::Tracer& self, imperative::OpBase* op, const imperative::VarBasePtrMap& inputs, - const imperative::VarBasePtrMap& outputs, - framework::BlockDesc* block, + imperative::VarBasePtrMap* outputs, + framework::AttributeMap attrs_map, const platform::CPUPlace expected_place, const bool stop_gradient = false) { - return self.Trace(op, inputs, outputs, block, expected_place, + pybind11::gil_scoped_release release; + return self.Trace(op, inputs, outputs, attrs_map, expected_place, stop_gradient); }) .def("trace", [](imperative::Tracer& self, imperative::OpBase* op, const imperative::VarBasePtrMap& inputs, - const imperative::VarBasePtrMap& outputs, - framework::BlockDesc* block, + imperative::VarBasePtrMap* outputs, + framework::AttributeMap attrs_map, const platform::CUDAPlace expected_place, const bool stop_gradient = false) { - return self.Trace(op, inputs, outputs, block, expected_place, + pybind11::gil_scoped_release release; + return self.Trace(op, inputs, outputs, attrs_map, expected_place, stop_gradient); }) .def("py_trace", &imperative::Tracer::PyTrace, diff --git a/paddle/fluid/pybind/imperative.h b/paddle/fluid/pybind/imperative.h index 8c48b2a7153c566930a074bd0bab1f054c13c2d5..8496cbfcb18798ee8ce1714431b7877bb2b7d377 100644 --- a/paddle/fluid/pybind/imperative.h +++ b/paddle/fluid/pybind/imperative.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once #include +#include #include #include "paddle/fluid/imperative/layer.h" #include "pybind11/pybind11.h" @@ -36,6 +37,8 @@ class Layer : public imperative::Layer { class PYBIND11_HIDDEN PyOpBase : public imperative::OpBase { public: using imperative::OpBase::OpBase; // Inherit constructors + + PyOpBase(const std::string& name) : OpBase(name) {} }; class PyVarBase : public imperative::VarBase { diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 7db2bb451b49918fd8d92a6036c132d34e965c63..236afc77f708c344665821edd4f7c7841c300465 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -65,7 +65,8 @@ void BindInferenceApi(py::module *m) { void BindPaddleDType(py::module *m) { py::enum_(*m, "PaddleDType") .value("FLOAT32", PaddleDType::FLOAT32) - .value("INT64", PaddleDType::INT64); + .value("INT64", PaddleDType::INT64) + .value("INT32", PaddleDType::INT32); } void BindPaddleBuf(py::module *m) { @@ -103,6 +104,11 @@ void BindPaddleBuf(py::module *m) { int64_t *data = static_cast(self.data()); return {data, data + self.length() / sizeof(*data)}; }) + .def("int32_data", + [](PaddleBuf &self) -> std::vector { + int32_t *data = static_cast(self.data()); + return {data, data + self.length() / sizeof(*data)}; + }) .def("length", &PaddleBuf::length); } @@ -221,7 +227,8 @@ void BindAnalysisConfig(py::module *m) { .def("enable_tensorrt_engine", &AnalysisConfig::EnableTensorRtEngine, py::arg("workspace_size") = 1 << 20, py::arg("max_batch_size") = 1, py::arg("min_subgraph_size") = 3, - py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32) + py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32, + py::arg("use_static") = true) .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled) .def("switch_ir_debug", &AnalysisConfig::SwitchIrDebug, py::arg("x") = true) diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc index 68f74a8531fff0c49c8a62d12f5cde7af77faf8a..c69ccd507210f976c1cb8ad072928b96693a948d 100644 --- a/paddle/fluid/pybind/ir.cc +++ b/paddle/fluid/pybind/ir.cc @@ -18,6 +18,7 @@ #include #include #include +#include #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" @@ -54,12 +55,14 @@ void BindGraph(py::module *m) { "The graph is a Directed Acyclic Single Static Assignment Graph, see " "`paddle::ir::Graph` for details.") .def(py::init()) + .def("clone", &Graph::Clone) .def("has", &Graph::Has) .def("get_int", &Graph::Get) .def("get_float", &Graph::Get) .def("get_double", &Graph::Get) .def("get_string", &Graph::Get) - .def("get_marked_nodes", &Graph::Get>) + .def("get_marked_nodes", &Graph::Get>, + return_value_policy::reference) .def("set", [](Graph &self, const std::string &attr_name, int attr) { return self.Set(attr_name, new int(attr)); }) .def("set", @@ -103,7 +106,8 @@ void BindGraph(py::module *m) { .def("retrieve_node", &Graph::RetrieveNode, return_value_policy::reference) .def("resolve_hazard", &Graph::ResolveHazard) - .def("origin_program_desc", &Graph::OriginProgram); + .def("origin_program_desc", &Graph::OriginProgram, + return_value_policy::reference); } void BindNode(py::module *m) { diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index e729be4a95a58510f1e0162af4216feaa400d971..31b5dd5d7c053d369bec6dac2c5ba0e73d7ddd60 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ 
-23,97 +23,7 @@ limitations under the License. */ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/var_desc.h" -// Cast boost::variant for PyBind. -// Copy from -// https://github.com/pybind/pybind11/issues/576#issuecomment-269563199 -namespace pybind11 { -namespace detail { - -#if !defined(PYBIND11_HIDDEN) -#ifdef _WIN32 -#define PYBIND11_HIDDEN __declspec(dllexport) -#else -#define PYBIND11_HIDDEN __attribute__((visibility("hidden"))) -#endif -#endif - -// Can be replaced by a generic lambda in C++14 -struct PYBIND11_HIDDEN paddle_variant_caster_visitor - : public boost::static_visitor { - return_value_policy policy; - handle parent; - - paddle_variant_caster_visitor(return_value_policy policy, handle parent) - : policy(policy), parent(parent) {} - - template - handle operator()(T const &src) const { - return make_caster::cast(src, policy, parent); - } -}; - -template -struct paddle_variant_caster; - -template